X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
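  // Illustrative sketch (hedged; the exact calls are elided in this listing):
  // the i8/0-or-1 scalar setcc convention and the 0-or-minus-one vector mask
  // convention described above are normally configured through TargetLowering
  // hooks such as:
  //   setBooleanContents(ZeroOrOneBooleanContent);
  //   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);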
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
154 else if (Subtarget.is64Bit())
156 else
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
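  // Illustrative sketch (not verbatim from this file): addBypassSlowDiv(64, 32)
  // lets CodeGenPrepare guard a 64-bit division with a runtime width check,
  // conceptually:
  //   if (((Dividend | Divisor) >> 32) == 0)
  //     Quot = (uint32_t)Dividend / (uint32_t)Divisor; // cheap 32-bit DIV
  //   else
  //     Quot = Dividend / Divisor;                     // full 64-bit DIV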
168
169 if (Subtarget.canUseCMPXCHG16B())
171 else if (Subtarget.canUseCMPXCHG8B())
173 else
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 }
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
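  // Illustrative example (hedged): an i32 funnel shift such as
  //   %r = call i32 @llvm.fshl.i32(i32 %hi, i32 %lo, i32 %amt)
  // can be selected to a single SHLD, which is why the i32/i64 cases are Legal
  // on targets with fast SHLD, and Custom on slow-SHLD targets where the
  // double-shift form is only used when optimizing for size.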
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
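  // Illustrative example: because x86's DIV/IDIV produce the quotient and the
  // remainder at once, a pair such as
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is combined into a single two-result divide during lowering, i.e. one IDIV
  // whose EAX result feeds %q and whose EDX result feeds %r.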
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
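  // Illustrative example (hedged): with the promotion above, an i16 count of
  // trailing zeros
  //   %r = call i16 @llvm.cttz.i16(i16 %x, i1 false)
  // is performed on a 32-bit register (TZCNT/REP BSF on e.g. EAX rather than
  // AX), giving a shorter encoding and avoiding the tzcntw false dependency
  // noted above.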
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
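  // Illustrative example (hedged): with the promotion above, an i16 popcount
  //   %r = call i16 @llvm.ctpop.i16(i16 %x)
  // is zero-extended and counted as i32, so popcntl is emitted instead of the
  // longer popcntw encoding with its false output dependency.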
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
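  // Illustrative example (hedged): on such CPUs an aligned 16-byte atomic load,
  //   %v = load atomic i128, ptr %p seq_cst, align 16
  // can be emitted as a single 128-bit vector move (e.g. VMOVDQA) rather than a
  // CMPXCHG16B loop, as the comment above describes.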
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
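  // Illustrative example (hedged): with the extload marked Expand, a sequence
  //   %f = load float, ptr %p
  //   %d = fpext float %f to double
  // is normally selected as a load plus cvtss2sd; the single memory-operand
  // cvtss2sd form mentioned above is only worth matching via (fpext (load))
  // when optimizing for size.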
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
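  // Illustrative example (hedged): the legal immediates registered above are
  // those with a cheap materialization, e.g. returning 0.0f compiles to a
  // single "xorps %xmm0, %xmm0", whereas an arbitrary constant such as 3.14f
  // is loaded from the constant pool rather than treated as a legal immediate.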
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
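  // Illustrative example (hedged): for the small vector types above, a load like
  //   %v = load <2 x i32>, ptr %p
  // is custom-lowered to a single 64-bit FP-domain load (movsd-style) that is
  // then bitcast, instead of an i64 load on 64-bit targets or two i32 loads on
  // 32-bit targets, per the comment above.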
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
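  // Illustrative note (hedged): per-element variable vector shifts only exist
  // with AVX2 (VPSLLVD/VPSRLVD/VPSRAVD, VPSLLVQ/VPSRLVQ), so on SSE2-only
  // targets a shift such as
  //   %r = shl <4 x i32> %x, %amts
  // goes through the custom lowering above, e.g. using the shift-by-scalar
  // instructions when the amount is a splat.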
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
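  // Illustrative example: with these extending loads legal, a pattern such as
  //   %b = load <4 x i8>, ptr %p
  //   %w = zext <4 x i8> %b to <4 x i32>
  // selects directly to pmovzxbd with a memory operand instead of a load
  // followed by separate unpack-based extension.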
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
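  // Illustrative example: per the promotions above,
  //   %r = fptosi <8 x float> %v to <8 x i16>
  // is first converted to <8 x i32> (cvttps2dq-style) and then truncated down
  // to v8i16, since there is no direct f32->i16 packed conversion here.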
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
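  // Illustrative example: extracting the upper 128-bit half,
  //   %hi = shufflevector <8 x float> %v, <8 x float> poison,
  //                       <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  // maps to a single vextractf128, while the lower half is free (it is simply
  // the low 128-bit lane of the same register).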
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
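  // Illustrative example: without AVX512FP16 but with F16C, an operation like
  //   %r = fadd <8 x half> %a, %b
  // is promoted to <8 x float>: the operands are widened with vcvtph2ps, the
  // add is done in single precision, and the result is narrowed back with
  // vcvtps2ph.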
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
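  // Illustrative note (hedged): the byte-sized mask move KMOVB is an AVX512DQ
  // instruction, so without DQ a v8i1 (or narrower) mask load/store is
  // custom-lowered, e.g. by moving the mask through a GPR with KMOVW and
  // accessing only the low byte in memory.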
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
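  // Illustrative note (hedged): AVX512DQ with VLX provides packed FP -> 64-bit
  // integer conversions at 128/256-bit widths (e.g. VCVTPS2QQ, VCVTPD2QQ), so
  // lrint/llrint on these types can simply be marked Legal.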
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
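  // Illustrative example (hedged): without VLX, a 256-bit masked load such as
  //   call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 4,
  //                                               <8 x i1> %m, <8 x float> %pt)
  // is widened here to a 512-bit masked load so the AVX512 k-register masking
  // form can be used, instead of the AVX2 vmaskmovps with a vector mask.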
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2421 // the operation action to Custom and do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is. We
2598 // should promote the value to 64 bits to solve this. This is what the CRT
2599 // headers do - `fmodf` is an inline header function that casts to f64 and
2600 // calls `fmod`.
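// Illustrative sketch (not the actual CRT source), roughly:
//   inline float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); }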
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD:::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We cannot replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // The offset should fit into a 32-bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement, we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative
2932 // half of the 32-bit address space. We do not accept negative offsets, since
2933 // they may be just out of range, but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For other non-large code models we assume that the last small object ends
2938 // 16MB before the 31-bit boundary. We may also accept pretty large negative
2939 // constants, knowing that all objects are in the positive half of the address
2940 // space.
2941 return Offset < 16 * 1024 * 1024;
2942}
2943
2944/// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
2981
2982/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
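// Illustrative note (not in the original source): comi/ucomi set the flags like
// an unsigned integer compare, which is why SETOGT maps to COND_A ('above') and
// SETULT maps to COND_B ('below') in the switch below.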
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// The current x86 ISA includes the following FP cmov instructions:
3067/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
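/// Illustrative note (not in the original source): signed conditions such as
/// COND_G or COND_L have no FP cmov form, so they fall through to 'return false'.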
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocations target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265 // of those uses are extracted directly into stores, so that the extract + store
3266 // can be store-folded, or (4) some use is a legal full-width instruction, then
3267 // it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume its legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then this load is
3302 // only worth splitting if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
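// Illustrative examples (not in the original source): MulC == 9 -> (X << 3) + X,
// MulC == 7 -> (X << 3) - X, MulC == -9 -> -((X << 3) + X).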
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of a vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does baseline recommend not to perform the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // For vectors we don't really get much benefit from swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has an imm64 mask, then the inverse will have
3585 // at least an imm32 mask (or be a zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl by 1, 2, or 3, as those can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep an exactly 32-bit imm64; this is a zext i32 -> i64, which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750/// Return true if Val falls within the specified half-open range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756/// half-open range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767/// specified half-open range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773/// falls within the specified half-open range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780/// specified half-open range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786/// falls within the specified half-open range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
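/// Illustrative example (not in the original source): Mask == <4, 5, -1, 7>
/// matches Pos == 0, Size == 4, Low == 4, Step == 1.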
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815/// sequential range [Low, Low+Size), or is undef or is zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask. i.e. it just permutes them all.
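/// Illustrative example (not in the original source): <3, 2, 1, 0> is a complete
/// permute of a single input, while <0, 0, 1, 1> is not (elements 2 and 3 are
/// never referenced).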
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zero-ed lane of a vector.
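/// Illustrative example (not in the original source): the mask
/// <0, 1, 6, 7, -1, -1, 4, 5> widens to <0, 3, -1, 2>.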
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, its trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919  SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
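// Editorial example (not from the original source): scaling preserves the
// overall mask coverage, e.g. the 2-element mask <0,3> scales up to the
// 4-element mask <0,1,6,7>, and <0,1,6,7> scales back down to <0,3>.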
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This is different to scaleShuffleElements, which keeps the overall
3959// vector size the same.
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963  assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
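// Editorial example (not from the original source): growing the 2-element mask
// <0,3> from 128 to 256 bits (Scale == 2) remaps the second-source index past
// the widened first source, producing <0,5,-1,-1> with undef upper elements.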
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants into 32-bit halves in 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3987  SmallVector<SDValue, 32> Ops;
3988  bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
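// Editorial example (not from the original source): when i64 is not legal
// (32-bit mode), the v2i64 constant <1, 2> is emitted as the v4i32 build
// vector <1, 0, 2, 0> and bitcast back to v2i64 (little-endian layout).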
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4018  SmallVector<SDValue, 32> Ops;
4019  bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051                              SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071             TLI.isTypeLegal(VT.getVectorElementType())) {
4072    Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If commuting is allowed, they don't have to be in (Lo/Hi) order.
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
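// Editorial example (not from the original source): since IdxVal is rounded
// down to a whole chunk, extracting 128 bits from a v8i32 at index 3 uses
// 3 & ~3 == 0 and returns elements 0..3.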
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148  assert((Vec.getValueType().is256BitVector() ||
4149          Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166  // Inserting an UNDEF subvector just returns Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
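// Editorial example (not from the original source): v2i1 widens to v8i1 when
// AVX512DQ is available and to v16i1 otherwise; v8i1 is kept as-is on DQ
// targets but widens to v16i1 without DQ.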
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254                             SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4319    SmallVector<SDValue, 4> SrcOps;
4320    if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
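// Editorial example (not from the original source):
//   insert_subvector(insert_subvector(undef, X, 0), Y, N/2) collects as {X, Y}
//   insert_subvector(undef, X, N/2)                         collects as {undef, X}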
4334
4335// Helper to check if \p V can be split into subvectors with the upper
4336// subvectors all undef, in which case return (a concat of) the lower subvectors.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359  SmallVector<SDValue> Ops;
4360  return collectConcatOps(V.getNode(), Ops, DAG);
4361}
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4371  SmallVector<SDValue, 4> SubOps;
4372  if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395  unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4399  SmallVector<SDValue, 4> LoOps(NumOps);
4400  SmallVector<SDValue, 4> HiOps(NumOps);
4401  for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break an unary integer operation into 2 half sized ops and then
4418/// concatenate the result back.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420                                   const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436                                    const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4452template <typename F>
4453static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454                                const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true) {
4456 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4457 unsigned NumSubs = 1;
4458 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4459 (!CheckBWI && Subtarget.useAVX512Regs())) {
4460 if (VT.getSizeInBits() > 512) {
4461 NumSubs = VT.getSizeInBits() / 512;
4462 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4463 }
4464 } else if (Subtarget.hasAVX2()) {
4465 if (VT.getSizeInBits() > 256) {
4466 NumSubs = VT.getSizeInBits() / 256;
4467 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4468 }
4469 } else {
4470 if (VT.getSizeInBits() > 128) {
4471 NumSubs = VT.getSizeInBits() / 128;
4472 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4473 }
4474 }
4475
4476 if (NumSubs == 1)
4477 return Builder(DAG, DL, Ops);
4478
4479  SmallVector<SDValue, 4> Subs;
4480  for (unsigned i = 0; i != NumSubs; ++i) {
4481    SmallVector<SDValue, 2> SubOps;
4482    for (SDValue Op : Ops) {
4483 EVT OpVT = Op.getValueType();
4484 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4485 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4486 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4487 }
4488 Subs.push_back(Builder(DAG, DL, SubOps));
4489 }
4490 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4491}
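// Editorial example (not from the original source): with CheckBWI set on an
// AVX2 target without BWI, a 512-bit (e.g. v64i8) operation is split into two
// 256-bit Builder calls whose results are rejoined with ISD::CONCAT_VECTORS.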
4492
4493// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4494// targets.
4495static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4496                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4497                             const X86Subtarget &Subtarget) {
4498 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4499 MVT SVT = VT.getScalarType();
4500
4501 // If we have a 32/64 splatted constant, splat it to DstTy to
4502 // encourage a foldable broadcast'd operand.
4503 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4504 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4505 // AVX512 broadcasts 32/64-bit operands.
4506 // TODO: Support float once getAVX512Node is used by fp-ops.
4507 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509 return SDValue();
4510 // If we're not widening, don't bother if we're not bitcasting.
4511 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4512 return SDValue();
4514 APInt SplatValue, SplatUndef;
4515 unsigned SplatBitSize;
4516 bool HasAnyUndefs;
4517 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4518 HasAnyUndefs, OpEltSizeInBits) &&
4519 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4520 return DAG.getConstant(SplatValue, DL, DstVT);
4521 }
4522 return SDValue();
4523 };
4524
4525 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4526
4527 MVT DstVT = VT;
4528 if (Widen)
4529 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4530
4531 // Canonicalize src operands.
4532 SmallVector<SDValue> SrcOps(Ops);
4533 for (SDValue &Op : SrcOps) {
4534 MVT OpVT = Op.getSimpleValueType();
4535 // Just pass through scalar operands.
4536 if (!OpVT.isVector())
4537 continue;
4538 assert(OpVT == VT && "Vector type mismatch");
4539
4540 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4541 Op = BroadcastOp;
4542 continue;
4543 }
4544
4545 // Just widen the subvector by inserting into an undef wide vector.
4546 if (Widen)
4547 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4548 }
4549
4550 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4551
4552 // Perform the 512-bit op then extract the bottom subvector.
4553 if (Widen)
4554 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4555 return Res;
4556}
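// Editorial example (not from the original source): on AVX512F without VLX, a
// 256-bit opcode is widened to 512 bits (operands inserted into undef wide
// vectors), executed, and the low 256 bits extracted; splatted 32/64-bit
// integer constants are rebuilt at the wide type so they can fold as broadcast
// operands.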
4557
4558/// Insert i1-subvector to i1-vector.
4559static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4560                                const X86Subtarget &Subtarget) {
4561
4562 SDLoc dl(Op);
4563 SDValue Vec = Op.getOperand(0);
4564 SDValue SubVec = Op.getOperand(1);
4565 SDValue Idx = Op.getOperand(2);
4566 unsigned IdxVal = Op.getConstantOperandVal(2);
4567
4568 // Inserting undef is a nop. We can just return the original vector.
4569 if (SubVec.isUndef())
4570 return Vec;
4571
4572 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4573 return Op;
4574
4575 MVT OpVT = Op.getSimpleValueType();
4576 unsigned NumElems = OpVT.getVectorNumElements();
4577 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4578
4579 // Extend to natively supported kshift.
4580 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4581
4582 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4583 // if necessary.
4584 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4585 // May need to promote to a legal type.
4586 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587 DAG.getConstant(0, dl, WideOpVT),
4588 SubVec, Idx);
4589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4590 }
4591
4592 MVT SubVecVT = SubVec.getSimpleValueType();
4593 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4594 assert(IdxVal + SubVecNumElems <= NumElems &&
4595 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4596 "Unexpected index value in INSERT_SUBVECTOR");
4597
4598 SDValue Undef = DAG.getUNDEF(WideOpVT);
4599
4600 if (IdxVal == 0) {
4601 // Zero lower bits of the Vec
4602 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4603 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4604 ZeroIdx);
4605 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4606 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4607 // Merge them together, SubVec should be zero extended.
4608 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4609 DAG.getConstant(0, dl, WideOpVT),
4610 SubVec, ZeroIdx);
4611 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4612 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4613 }
4614
4615 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4616 Undef, SubVec, ZeroIdx);
4617
4618 if (Vec.isUndef()) {
4619 assert(IdxVal != 0 && "Unexpected index");
4620 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4621 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4623 }
4624
4625  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4626    assert(IdxVal != 0 && "Unexpected index");
4627 // If upper elements of Vec are known undef, then just shift into place.
4628 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4629 [](SDValue V) { return V.isUndef(); })) {
4630 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4631 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4632 } else {
4633 NumElems = WideOpVT.getVectorNumElements();
4634 unsigned ShiftLeft = NumElems - SubVecNumElems;
4635 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4636 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4637 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4638 if (ShiftRight != 0)
4639 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4640 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4641 }
4642 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4643 }
4644
4645 // Simple case when we put subvector in the upper part
4646 if (IdxVal + SubVecNumElems == NumElems) {
4647 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4648 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4649 if (SubVecNumElems * 2 == NumElems) {
4650 // Special case, use legal zero extending insert_subvector. This allows
4651 // isel to optimize when bits are known zero.
4652 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4653 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4654 DAG.getConstant(0, dl, WideOpVT),
4655 Vec, ZeroIdx);
4656 } else {
4657 // Otherwise use explicit shifts to zero the bits.
4658 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4659 Undef, Vec, ZeroIdx);
4660 NumElems = WideOpVT.getVectorNumElements();
4661 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4662 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4663 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4664 }
4665 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4667 }
4668
4669 // Inserting into the middle is more complicated.
4670
4671 NumElems = WideOpVT.getVectorNumElements();
4672
4673 // Widen the vector if needed.
4674 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4675
4676 unsigned ShiftLeft = NumElems - SubVecNumElems;
4677 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4678
4679 // Do an optimization for the most frequently used types.
4680 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4681 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4682 Mask0.flipAllBits();
4683 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4684 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4685 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4686 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4687 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4688 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4689 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4690 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4691
4692 // Reduce to original width if needed.
4693 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4694 }
4695
4696 // Clear the upper bits of the subvector and move it to its insert position.
4697 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4698 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4699 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4700 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4701
4702 // Isolate the bits below the insertion point.
4703 unsigned LowShift = NumElems - IdxVal;
4704 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4705 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4706 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4707 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4708
4709 // Isolate the bits after the last inserted bit.
4710 unsigned HighShift = IdxVal + SubVecNumElems;
4711 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4712 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4713 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4714 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4715
4716 // Now OR all 3 pieces together.
4717 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4718 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4719
4720 // Reduce to original width if needed.
4721 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4722}
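// Editorial example (not from the original source): inserting a v2i1 subvector
// into v8i1 at index 2 (with a v8i1 wide type, i.e. AVX512DQ) shifts the
// subvector left by 6 then right by 4 so it lands zero-extended at bits 2-3,
// ANDs the destination with ~0b00001100, and ORs the two pieces together.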
4723
4724static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4725                                const SDLoc &dl) {
4726 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4727 EVT SubVT = V1.getValueType();
4728 EVT SubSVT = SubVT.getScalarType();
4729 unsigned SubNumElts = SubVT.getVectorNumElements();
4730 unsigned SubVectorWidth = SubVT.getSizeInBits();
4731 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4732 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4733 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4734}
4735
4736/// Returns a vector of specified type with all bits set.
4737/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4738/// Then bitcast to their original type, ensuring they get CSE'd.
4739static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4740 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4741 "Expected a 128/256/512-bit vector type");
4742 unsigned NumElts = VT.getSizeInBits() / 32;
4743 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4744 return DAG.getBitcast(VT, Vec);
4745}
4746
4747static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4748 SDValue In, SelectionDAG &DAG) {
4749 EVT InVT = In.getValueType();
4750 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4751
4752 // Canonicalize Opcode to general extension version.
4753 switch (Opcode) {
4754 case ISD::ANY_EXTEND:
4755  case ISD::ANY_EXTEND_VECTOR_INREG:
4756    Opcode = ISD::ANY_EXTEND;
4757 break;
4758 case ISD::SIGN_EXTEND:
4759  case ISD::SIGN_EXTEND_VECTOR_INREG:
4760    Opcode = ISD::SIGN_EXTEND;
4761 break;
4762 case ISD::ZERO_EXTEND:
4763  case ISD::ZERO_EXTEND_VECTOR_INREG:
4764    Opcode = ISD::ZERO_EXTEND;
4765 break;
4766 default:
4767 llvm_unreachable("Unknown extension opcode");
4768 }
4769
4770 // For 256-bit vectors, we only need the lower (128-bit) input half.
4771 // For 512-bit vectors, we only need the lower input half or quarter.
4772 if (InVT.getSizeInBits() > 128) {
4773 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4774 "Expected VTs to be the same size!");
4775 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4776 In = extractSubVector(In, 0, DAG, DL,
4777 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4778 InVT = In.getValueType();
4779 }
4780
4781 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4782 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4783
4784 return DAG.getNode(Opcode, DL, VT, In);
4785}
4786
4787// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4788static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4789                            SDValue Mask, SelectionDAG &DAG) {
4790 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4791 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4792 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4793}
4794
4795static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4796                                    bool Lo, bool Unary) {
4797 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4798 "Illegal vector type to unpack");
4799 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4800 int NumElts = VT.getVectorNumElements();
4801 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4802 for (int i = 0; i < NumElts; ++i) {
4803 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4804 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4805 Pos += (Unary ? 0 : NumElts * (i % 2));
4806 Pos += (Lo ? 0 : NumEltsInLane / 2);
4807 Mask.push_back(Pos);
4808 }
4809}
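// Editorial example (not from the original source): for v8i16 the non-unary Lo
// mask is <0,8,1,9,2,10,3,11> and the Hi mask is <4,12,5,13,6,14,7,15>,
// matching PUNPCKLWD/PUNPCKHWD.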
4810
4811/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4812/// imposed by AVX and specific to the unary pattern. Example:
4813/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4814/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4815static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4816                                    bool Lo) {
4817 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4818 int NumElts = VT.getVectorNumElements();
4819 for (int i = 0; i < NumElts; ++i) {
4820 int Pos = i / 2;
4821 Pos += (Lo ? 0 : NumElts / 2);
4822 Mask.push_back(Pos);
4823 }
4824}
4825
4826// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4827static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4828 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4831 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4832 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4833 int M = Mask[I];
4834 if (M < 0)
4835 continue;
4836 SDValue V = (M < NumElts) ? V1 : V2;
4837 if (V.isUndef())
4838 continue;
4839 Ops[I] = V.getOperand(M % NumElts);
4840 }
4841 return DAG.getBuildVector(VT, dl, Ops);
4842 }
4843
4844 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4845}
4846
4847/// Returns a vector_shuffle node for an unpackl operation.
4848static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4849 SDValue V1, SDValue V2) {
4850  SmallVector<int, 8> Mask;
4851  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4852 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4853}
4854
4855/// Returns a vector_shuffle node for an unpackh operation.
4856static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4857 SDValue V1, SDValue V2) {
4858  SmallVector<int, 8> Mask;
4859  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4860 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4861}
4862
4863/// Returns a node that packs the LHS + RHS nodes together at half width.
4864/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4865/// TODO: Add subvector splitting if/when we have a need for it.
4866static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4867 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4868 bool PackHiHalf = false) {
4869 MVT OpVT = LHS.getSimpleValueType();
4870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4871 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4872 assert(OpVT == RHS.getSimpleValueType() &&
4873 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4874 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4875 "Unexpected PACK operand types");
4876 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4877 "Unexpected PACK result type");
4878
4879 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4880 if (EltSizeInBits == 32) {
4881 SmallVector<int> PackMask;
4882 int Offset = PackHiHalf ? 1 : 0;
4883 int NumElts = VT.getVectorNumElements();
4884 for (int I = 0; I != NumElts; I += 4) {
4885 PackMask.push_back(I + Offset);
4886 PackMask.push_back(I + Offset + 2);
4887 PackMask.push_back(I + Offset + NumElts);
4888 PackMask.push_back(I + Offset + NumElts + 2);
4889 }
4890 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4891 DAG.getBitcast(VT, RHS), PackMask);
4892 }
4893
4894 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4895 if (!PackHiHalf) {
4896 if (UsePackUS &&
4897 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4898 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4899 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4900
4901 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4902 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4904 }
4905
4906 // Fallback to sign/zero extending the requested half and pack.
4907 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4908 if (UsePackUS) {
4909 if (PackHiHalf) {
4910 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4911 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4912 } else {
4913 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4914 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4915 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4916 };
4917 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4918 };
4919
4920 if (!PackHiHalf) {
4921 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4922 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4923 }
4924 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4925 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4926 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4927}
4928
4929/// Return a vector_shuffle of the specified vector of zero or undef vector.
4930/// This produces a shuffle where the low element of V2 is swizzled into the
4931/// zero/undef vector, landing at element Idx.
4932/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4933static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4934                                           bool IsZero,
4935 const X86Subtarget &Subtarget,
4936 SelectionDAG &DAG) {
4937 MVT VT = V2.getSimpleValueType();
4938 SDValue V1 = IsZero
4939 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4940 int NumElems = VT.getVectorNumElements();
4941 SmallVector<int, 16> MaskVec(NumElems);
4942 for (int i = 0; i != NumElems; ++i)
4943 // If this is the insertion idx, put the low elt of V2 here.
4944 MaskVec[i] = (i == Idx) ? NumElems : i;
4945 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4946}
4947
4948static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4949  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4950 Ptr.getOpcode() == X86ISD::WrapperRIP)
4951 Ptr = Ptr.getOperand(0);
4952  return dyn_cast<ConstantPoolSDNode>(Ptr);
4953}
4954
4955// TODO: Add support for non-zero offsets.
4956static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4957  ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4958  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4959 return nullptr;
4960 return CNode->getConstVal();
4961}
4962
4963static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4964  if (!Load || !ISD::isNormalLoad(Load))
4965 return nullptr;
4966 return getTargetConstantFromBasePtr(Load->getBasePtr());
4967}
4968
4973
4974const Constant *
4975X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4976  assert(LD && "Unexpected null LoadSDNode");
4977 return getTargetConstantFromNode(LD);
4978}
4979
4981 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4982 SDValue Cond = N->getOperand(0);
4983 SDValue RHS = N->getOperand(2);
4984 EVT CondVT = Cond.getValueType();
4985 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4986 CondVT.getVectorElementType() == MVT::i1 &&
4987 ISD::isBuildVectorAllZeros(RHS.getNode());
4988}
4989
4990// Extract raw constant bits from constant pools.
4991static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4992 APInt &UndefElts,
4993 SmallVectorImpl<APInt> &EltBits,
4994 bool AllowWholeUndefs = true,
4995 bool AllowPartialUndefs = false) {
4996 assert(EltBits.empty() && "Expected an empty EltBits vector");
4997
4998  Op = peekThroughBitcasts(Op);
4999
5000 EVT VT = Op.getValueType();
5001 unsigned SizeInBits = VT.getSizeInBits();
5002 unsigned NumElts = SizeInBits / EltSizeInBits;
5003
5004 // Can't split constant.
5005 if ((SizeInBits % EltSizeInBits) != 0)
5006 return false;
5007
5008 // Bitcast a source array of element bits to the target size.
5009 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5010 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5011 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5012 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5013 "Constant bit sizes don't match");
5014
5015 // Don't split if we don't allow undef bits.
5016 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5017 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5018 return false;
5019
5020 // If we're already the right size, don't bother bitcasting.
5021 if (NumSrcElts == NumElts) {
5022 UndefElts = UndefSrcElts;
5023 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5024 return true;
5025 }
5026
5027 // Extract all the undef/constant element data and pack into single bitsets.
5028 APInt UndefBits(SizeInBits, 0);
5029 APInt MaskBits(SizeInBits, 0);
5030
5031 for (unsigned i = 0; i != NumSrcElts; ++i) {
5032 unsigned BitOffset = i * SrcEltSizeInBits;
5033 if (UndefSrcElts[i])
5034 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5035 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5036 }
5037
5038 // Split the undef/constant single bitset data into the target elements.
5039 UndefElts = APInt(NumElts, 0);
5040 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5041
5042 for (unsigned i = 0; i != NumElts; ++i) {
5043 unsigned BitOffset = i * EltSizeInBits;
5044 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5045
5046 // Only treat an element as UNDEF if all bits are UNDEF.
5047 if (UndefEltBits.isAllOnes()) {
5048 if (!AllowWholeUndefs)
5049 return false;
5050 UndefElts.setBit(i);
5051 continue;
5052 }
5053
5054 // If only some bits are UNDEF then treat them as zero (or bail if not
5055 // supported).
5056 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5057 return false;
5058
5059 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5060 }
5061 return true;
5062 };
5063
5064 // Collect constant bits and insert into mask/undef bit masks.
5065 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5066 unsigned UndefBitIndex) {
5067 if (!Cst)
5068 return false;
5069 if (isa<UndefValue>(Cst)) {
5070 Undefs.setBit(UndefBitIndex);
5071 return true;
5072 }
5073 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5074 Mask = CInt->getValue();
5075 return true;
5076 }
5077 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5078 Mask = CFP->getValueAPF().bitcastToAPInt();
5079 return true;
5080 }
5081 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5082 Type *Ty = CDS->getType();
5083 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5084 Type *EltTy = CDS->getElementType();
5085 bool IsInteger = EltTy->isIntegerTy();
5086 bool IsFP =
5087 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5088 if (!IsInteger && !IsFP)
5089 return false;
5090 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5091 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5092 if (IsInteger)
5093 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5094 else
5095 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5096 I * EltBits);
5097 return true;
5098 }
5099 return false;
5100 };
5101
5102 // Handle UNDEFs.
5103 if (Op.isUndef()) {
5104 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5105 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5106 return CastBitData(UndefSrcElts, SrcEltBits);
5107 }
5108
5109 // Extract scalar constant bits.
5110 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5111 APInt UndefSrcElts = APInt::getZero(1);
5112 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5113 return CastBitData(UndefSrcElts, SrcEltBits);
5114 }
5115 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5116 APInt UndefSrcElts = APInt::getZero(1);
5117 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5118 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5119 return CastBitData(UndefSrcElts, SrcEltBits);
5120 }
5121
5122 // Extract constant bits from build vector.
5123 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5124 BitVector Undefs;
5125 SmallVector<APInt> SrcEltBits;
5126 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5127 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5128 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5129 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5130 if (Undefs[I])
5131 UndefSrcElts.setBit(I);
5132 return CastBitData(UndefSrcElts, SrcEltBits);
5133 }
5134 }
5135
5136 // Extract constant bits from constant pool vector.
5137 if (auto *Cst = getTargetConstantFromNode(Op)) {
5138 Type *CstTy = Cst->getType();
5139 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5140 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5141 return false;
5142
5143 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5144 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5145 if ((SizeInBits % SrcEltSizeInBits) != 0)
5146 return false;
5147
5148 APInt UndefSrcElts(NumSrcElts, 0);
5149 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5150 for (unsigned i = 0; i != NumSrcElts; ++i)
5151 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5152 UndefSrcElts, i))
5153 return false;
5154
5155 return CastBitData(UndefSrcElts, SrcEltBits);
5156 }
5157
5158 // Extract constant bits from a broadcasted constant pool scalar.
5159 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5160 EltSizeInBits <= VT.getScalarSizeInBits()) {
5161 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5162 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5163 return false;
5164
5165 SDValue Ptr = MemIntr->getBasePtr();
5166    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5167      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5168 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5169
5170 APInt UndefSrcElts(NumSrcElts, 0);
5171 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5172 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5173 if (UndefSrcElts[0])
5174 UndefSrcElts.setBits(0, NumSrcElts);
5175 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5176 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5177 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5178 return CastBitData(UndefSrcElts, SrcEltBits);
5179 }
5180 }
5181 }
5182
5183 // Extract constant bits from a subvector broadcast.
5184 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5185 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5186 SDValue Ptr = MemIntr->getBasePtr();
5187 // The source constant may be larger than the subvector broadcast,
5188    // so ensure we extract the correct subvector constants.
5189 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5190 Type *CstTy = Cst->getType();
5191 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5192 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5193 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5194 (SizeInBits % SubVecSizeInBits) != 0)
5195 return false;
5196 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5197 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5198 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5199 APInt UndefSubElts(NumSubElts, 0);
5200 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5201 APInt(CstEltSizeInBits, 0));
5202 for (unsigned i = 0; i != NumSubElts; ++i) {
5203 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5204 UndefSubElts, i))
5205 return false;
5206 for (unsigned j = 1; j != NumSubVecs; ++j)
5207 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5208 }
5209 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5210 UndefSubElts);
5211 return CastBitData(UndefSubElts, SubEltBits);
5212 }
5213 }
5214
5215 // Extract a rematerialized scalar constant insertion.
5216 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5217 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5218 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5219 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5220 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5221
5222 APInt UndefSrcElts(NumSrcElts, 0);
5223 SmallVector<APInt, 64> SrcEltBits;
5224 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5225 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5226 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5227 return CastBitData(UndefSrcElts, SrcEltBits);
5228 }
5229
5230 // Insert constant bits from a base and sub vector sources.
5231 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5232    // If this bitcasts to larger elements we might lose track of undefs - don't
5233 // allow any to be safe.
5234 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5235 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5236
5237 APInt UndefSrcElts, UndefSubElts;
5238 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5239 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5240 UndefSubElts, EltSubBits,
5241 AllowWholeUndefs && AllowUndefs,
5242 AllowPartialUndefs && AllowUndefs) &&
5243 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5244 UndefSrcElts, EltSrcBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs)) {
5247 unsigned BaseIdx = Op.getConstantOperandVal(2);
5248 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5249 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5250 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5251 return CastBitData(UndefSrcElts, EltSrcBits);
5252 }
5253 }
5254
5255 // Extract constant bits from a subvector's source.
5256 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5257 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5258 EltBits, AllowWholeUndefs,
5259 AllowPartialUndefs)) {
5260 EVT SrcVT = Op.getOperand(0).getValueType();
5261 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5262 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5263 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5264 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5265 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5266 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5268
5269 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5270 if ((BaseIdx + NumSubElts) != NumSrcElts)
5271 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5272 if (BaseIdx != 0)
5273 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5274 return true;
5275 }
5276
5277 // Extract constant bits from shuffle node sources.
5278 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5279 // TODO - support shuffle through bitcasts.
5280 if (EltSizeInBits != VT.getScalarSizeInBits())
5281 return false;
5282
5283 ArrayRef<int> Mask = SVN->getMask();
5284 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5285 llvm::any_of(Mask, [](int M) { return M < 0; }))
5286 return false;
5287
5288 APInt UndefElts0, UndefElts1;
5289 SmallVector<APInt, 32> EltBits0, EltBits1;
5290 if (isAnyInRange(Mask, 0, NumElts) &&
5291 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5292 UndefElts0, EltBits0, AllowWholeUndefs,
5293 AllowPartialUndefs))
5294 return false;
5295 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5296 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5297 UndefElts1, EltBits1, AllowWholeUndefs,
5298 AllowPartialUndefs))
5299 return false;
5300
5301 UndefElts = APInt::getZero(NumElts);
5302 for (int i = 0; i != (int)NumElts; ++i) {
5303 int M = Mask[i];
5304 if (M < 0) {
5305 UndefElts.setBit(i);
5306 EltBits.push_back(APInt::getZero(EltSizeInBits));
5307 } else if (M < (int)NumElts) {
5308 if (UndefElts0[M])
5309 UndefElts.setBit(i);
5310 EltBits.push_back(EltBits0[M]);
5311 } else {
5312 if (UndefElts1[M - NumElts])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits1[M - NumElts]);
5315 }
5316 }
5317 return true;
5318 }
5319
5320 return false;
5321}
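// Editorial example (not from the original source): querying a v4i32 build
// vector of four 0x01020304 constants with EltSizeInBits == 64 repacks the 128
// bits into two 64-bit elements equal to 0x0102030401020304.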
5322
5323namespace llvm {
5324namespace X86 {
5325bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5326 APInt UndefElts;
5327 SmallVector<APInt, 16> EltBits;
5328  if (getTargetConstantBitsFromNode(
5329          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5330 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5331 int SplatIndex = -1;
5332 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5333 if (UndefElts[i])
5334 continue;
5335 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5336 SplatIndex = -1;
5337 break;
5338 }
5339 SplatIndex = i;
5340 }
5341 if (0 <= SplatIndex) {
5342 SplatVal = EltBits[SplatIndex];
5343 return true;
5344 }
5345 }
5346
5347 return false;
5348}
5349} // namespace X86
5350} // namespace llvm
5351
5352static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5353                                        unsigned MaskEltSizeInBits,
5354                                        SmallVectorImpl<uint64_t> &RawMask,
5355                                        APInt &UndefElts) {
5356 // Extract the raw target constant bits.
5357 SmallVector<APInt, 64> EltBits;
5358 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5359 EltBits, /* AllowWholeUndefs */ true,
5360 /* AllowPartialUndefs */ false))
5361 return false;
5362
5363 // Insert the extracted elements into the mask.
5364 for (const APInt &Elt : EltBits)
5365 RawMask.push_back(Elt.getZExtValue());
5366
5367 return true;
5368}
5369
5370static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts,
5371 bool AllowUndefs) {
5372 APInt UndefElts;
5373 SmallVector<APInt, 64> EltBits;
5374 if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits,
5375 /*AllowWholeUndefs*/ AllowUndefs,
5376 /*AllowPartialUndefs*/ false))
5377 return false;
5378
5379 bool IsPow2OrUndef = true;
5380 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5381 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5382 return IsPow2OrUndef;
5383}
5384
5385// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5386static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5387  // TODO: don't always ignore oneuse constraints.
5388 V = peekThroughBitcasts(V);
5389 EVT VT = V.getValueType();
5390
5391 // Match not(xor X, -1) -> X.
5392 if (V.getOpcode() == ISD::XOR &&
5393 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5394 isAllOnesConstant(V.getOperand(1))))
5395 return V.getOperand(0);
5396
5397 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5398 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5399 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5400 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5401 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5403 V.getOperand(1));
5404 }
5405 }
5406
5407 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5408 if (V.getOpcode() == X86ISD::PCMPGT &&
5409 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5410 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5411 V.getOperand(0).hasOneUse()) {
5412 APInt UndefElts;
5413 SmallVector<APInt> EltBits;
5414 if (getTargetConstantBitsFromNode(V.getOperand(0),
5415 V.getScalarValueSizeInBits(), UndefElts,
5416 EltBits) &&
5417 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5418 // Don't fold min_signed_value -> (min_signed_value - 1)
5419 bool MinSigned = false;
5420 for (APInt &Elt : EltBits) {
5421 MinSigned |= Elt.isMinSignedValue();
5422 Elt -= 1;
5423 }
5424 if (!MinSigned) {
5425 SDLoc DL(V);
5426 MVT VT = V.getSimpleValueType();
5427 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5428 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5429 }
5430 }
5431 }
5432
5433 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5434   SmallVector<SDValue, 4> CatOps;
5435   if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5436 for (SDValue &CatOp : CatOps) {
5437 SDValue NotCat = IsNOT(CatOp, DAG);
5438 if (!NotCat)
5439 return SDValue();
5440 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5441 }
5442 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5443 }
5444
5445 // Match not(or(not(X),not(Y))) -> and(X, Y).
5446 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5447 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5448 // TODO: Handle cases with single NOT operand -> ANDNP
5449 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5450 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5451 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5452 DAG.getBitcast(VT, Op1));
5453 }
5454
5455 return SDValue();
5456}
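// Worked examples for IsNOT (illustrative, not exhaustive):
//   IsNOT(xor X, all-ones)  --> X
//   IsNOT(pcmpgt(C, X))     --> pcmpgt(X, C - 1), because for signed compares
//     !(C > X) <=> (X >= C) <=> (X > C - 1), which is only safe when no
//     element of C is the minimum signed value (C - 1 would wrap).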
5457
5458/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5459/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5460/// Note: This ignores saturation, so inputs must be checked first.
5461 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5462                                   bool Unary, unsigned NumStages = 1) {
5463 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5464 unsigned NumElts = VT.getVectorNumElements();
5465 unsigned NumLanes = VT.getSizeInBits() / 128;
5466 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5467 unsigned Offset = Unary ? 0 : NumElts;
5468 unsigned Repetitions = 1u << (NumStages - 1);
5469 unsigned Increment = 1u << NumStages;
5470 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5471
5472 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5473 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5474 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5475 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5476 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5477 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5478 }
5479 }
5480}
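// Example (assuming a 128-bit PACKSSWB/PACKUSWB-style pack to v16i8 with
// Unary = false, NumStages = 1): the mask built above is
//   <0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30>
// i.e. the low byte of every 16-bit element of each input, with the second
// input's elements offset by NumElts.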
5481
5482// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5483static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5484 APInt &DemandedLHS, APInt &DemandedRHS) {
5485 int NumLanes = VT.getSizeInBits() / 128;
5486 int NumElts = DemandedElts.getBitWidth();
5487 int NumInnerElts = NumElts / 2;
5488 int NumEltsPerLane = NumElts / NumLanes;
5489 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5490
5491 DemandedLHS = APInt::getZero(NumInnerElts);
5492 DemandedRHS = APInt::getZero(NumInnerElts);
5493
5494 // Map DemandedElts to the packed operands.
5495 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5496 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5497 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5498 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5499 if (DemandedElts[OuterIdx])
5500 DemandedLHS.setBit(InnerIdx);
5501 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5502 DemandedRHS.setBit(InnerIdx);
5503 }
5504 }
5505}
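// Example (128-bit pack to v16i8, illustrative): demanded result elements
// 0..7 map to elements 0..7 of the LHS v8i16 operand and result elements
// 8..15 map to elements 0..7 of the RHS operand; for 256/512-bit packs the
// same mapping repeats independently within each 128-bit lane.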
5506
5507// Split the demanded elts of a HADD/HSUB node between its operands.
5508static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5509 APInt &DemandedLHS, APInt &DemandedRHS) {
5510   getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5511                                       DemandedLHS, DemandedRHS);
5512 DemandedLHS |= DemandedLHS << 1;
5513 DemandedRHS |= DemandedRHS << 1;
5514}
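// Example (v8i16 HADD, illustrative): result element 1 is LHS[2] + LHS[3],
// so demanding it sets bits 2 and 3 of DemandedLHS; result elements 4..7 map
// to RHS pairs the same way. The '<< 1' above widens each demanded pair-start
// bit to cover both elements of the pair.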
5515
5516/// Calculates the shuffle mask corresponding to the target-specific opcode.
5517/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5518/// operands in \p Ops, and returns true.
5519/// Sets \p IsUnary to true if only one source is used. Note that this will set
5520/// IsUnary for shuffles which use a single input multiple times, and in those
5521/// cases it will adjust the mask to only have indices within that single input.
5522/// It is an error to call this with non-empty Mask/Ops vectors.
5523static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5524                                  SmallVectorImpl<SDValue> &Ops,
5525                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5526 if (!isTargetShuffle(N.getOpcode()))
5527 return false;
5528
5529 MVT VT = N.getSimpleValueType();
5530 unsigned NumElems = VT.getVectorNumElements();
5531 unsigned MaskEltSize = VT.getScalarSizeInBits();
5532   SmallVector<uint64_t, 32> RawMask;
5533   APInt RawUndefs;
5534 uint64_t ImmN;
5535
5536 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5537 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5538
5539 IsUnary = false;
5540 bool IsFakeUnary = false;
5541 switch (N.getOpcode()) {
5542 case X86ISD::BLENDI:
5543 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5544 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5545 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5546 DecodeBLENDMask(NumElems, ImmN, Mask);
5547 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5548 break;
5549 case X86ISD::SHUFP:
5550 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5551 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5552 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5553 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5554 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5555 break;
5556 case X86ISD::INSERTPS:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::EXTRQI:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5566 isa<ConstantSDNode>(N.getOperand(2))) {
5567 int BitLen = N.getConstantOperandVal(1);
5568 int BitIdx = N.getConstantOperandVal(2);
5569 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5570 IsUnary = true;
5571 }
5572 break;
5573 case X86ISD::INSERTQI:
5574 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5575 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5576 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5577 isa<ConstantSDNode>(N.getOperand(3))) {
5578 int BitLen = N.getConstantOperandVal(2);
5579 int BitIdx = N.getConstantOperandVal(3);
5580 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5581 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5582 }
5583 break;
5584 case X86ISD::UNPCKH:
5585 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5586 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5587 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5588 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5589 break;
5590 case X86ISD::UNPCKL:
5591 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5592 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5593 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5594 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5595 break;
5596 case X86ISD::MOVHLPS:
5597 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5598 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5599 DecodeMOVHLPSMask(NumElems, Mask);
5600 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5601 break;
5602 case X86ISD::MOVLHPS:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5605 DecodeMOVLHPSMask(NumElems, Mask);
5606 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5607 break;
5608 case X86ISD::VALIGN:
5609 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5610 "Only 32-bit and 64-bit elements are supported!");
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5614 DecodeVALIGNMask(NumElems, ImmN, Mask);
5615 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5616 Ops.push_back(N.getOperand(1));
5617 Ops.push_back(N.getOperand(0));
5618 break;
5619 case X86ISD::PALIGNR:
5620 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5621 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5622 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5623 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5624 DecodePALIGNRMask(NumElems, ImmN, Mask);
5625 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5626 Ops.push_back(N.getOperand(1));
5627 Ops.push_back(N.getOperand(0));
5628 break;
5629 case X86ISD::VSHLDQ:
5630 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5633 DecodePSLLDQMask(NumElems, ImmN, Mask);
5634 IsUnary = true;
5635 break;
5636 case X86ISD::VSRLDQ:
5637 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5638 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5639 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5640 DecodePSRLDQMask(NumElems, ImmN, Mask);
5641 IsUnary = true;
5642 break;
5643 case X86ISD::PSHUFD:
5644 case X86ISD::VPERMILPI:
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::PSHUFHW:
5651 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5654 IsUnary = true;
5655 break;
5656 case X86ISD::PSHUFLW:
5657 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5658 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5659 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5660 IsUnary = true;
5661 break;
5662 case X86ISD::VZEXT_MOVL:
5663 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5664 DecodeZeroMoveLowMask(NumElems, Mask);
5665 IsUnary = true;
5666 break;
5667 case X86ISD::VBROADCAST:
5668     // We only decode broadcasts of same-sized vectors; peeking through to
5669     // extracted subvectors is likely to cause hasOneUse issues with
5670     // SimplifyDemandedBits etc.
5671 if (N.getOperand(0).getValueType() == VT) {
5672 DecodeVectorBroadcast(NumElems, Mask);
5673 IsUnary = true;
5674 break;
5675 }
5676 return false;
5677 case X86ISD::VPERMILPV: {
5678 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5679 IsUnary = true;
5680 SDValue MaskNode = N.getOperand(1);
5681 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5682 RawUndefs)) {
5683 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5684 break;
5685 }
5686 return false;
5687 }
5688 case X86ISD::PSHUFB: {
5689 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5690 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5691 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5692 IsUnary = true;
5693 SDValue MaskNode = N.getOperand(1);
5694 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5695 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 case X86ISD::VPERMI:
5701 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5702 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5703 DecodeVPERMMask(NumElems, ImmN, Mask);
5704 IsUnary = true;
5705 break;
5706 case X86ISD::MOVSS:
5707 case X86ISD::MOVSD:
5708 case X86ISD::MOVSH:
5709 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5710 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5711 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5712 break;
5713 case X86ISD::VPERM2X128:
5714 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5715 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5718 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5719 break;
5720 case X86ISD::SHUF128:
5721 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5723 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5724 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5725 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5726 break;
5727 case X86ISD::MOVSLDUP:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 DecodeMOVSLDUPMask(NumElems, Mask);
5730 IsUnary = true;
5731 break;
5732 case X86ISD::MOVSHDUP:
5733 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5734 DecodeMOVSHDUPMask(NumElems, Mask);
5735 IsUnary = true;
5736 break;
5737 case X86ISD::MOVDDUP:
5738 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5739 DecodeMOVDDUPMask(NumElems, Mask);
5740 IsUnary = true;
5741 break;
5742 case X86ISD::VPERMIL2: {
5743 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5744 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5745 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5746 SDValue MaskNode = N.getOperand(2);
5747 SDValue CtrlNode = N.getOperand(3);
5748 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5749 unsigned CtrlImm = CtrlOp->getZExtValue();
5750 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5751 RawUndefs)) {
5752 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5753 Mask);
5754 break;
5755 }
5756 }
5757 return false;
5758 }
5759 case X86ISD::VPPERM: {
5760 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5761 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5762 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5763 SDValue MaskNode = N.getOperand(2);
5764 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5765 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5766 break;
5767 }
5768 return false;
5769 }
5770 case X86ISD::VPERMV: {
5771 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5772 IsUnary = true;
5773 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5774 Ops.push_back(N.getOperand(1));
5775 SDValue MaskNode = N.getOperand(0);
5776 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5777 RawUndefs)) {
5778 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5779 break;
5780 }
5781 return false;
5782 }
5783 case X86ISD::VPERMV3: {
5784 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5785 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5786 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5787 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5788 Ops.push_back(N.getOperand(0));
5789 Ops.push_back(N.getOperand(2));
5790 SDValue MaskNode = N.getOperand(1);
5791 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5792 RawUndefs)) {
5793 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5794 break;
5795 }
5796 return false;
5797 }
5798 default:
5799 llvm_unreachable("unknown target shuffle node");
5800 }
5801
5802 // Empty mask indicates the decode failed.
5803 if (Mask.empty())
5804 return false;
5805
5806 // Check if we're getting a shuffle mask with zero'd elements.
5807 if (!AllowSentinelZero && isAnyZero(Mask))
5808 return false;
5809
5810 // If we have a fake unary shuffle, the shuffle mask is spread across two
5811 // inputs that are actually the same node. Re-map the mask to always point
5812 // into the first input.
5813 if (IsFakeUnary)
5814 for (int &M : Mask)
5815 if (M >= (int)Mask.size())
5816 M -= Mask.size();
5817
5818 // If we didn't already add operands in the opcode-specific code, default to
5819 // adding 1 or 2 operands starting at 0.
5820 if (Ops.empty()) {
5821 Ops.push_back(N.getOperand(0));
5822 if (!IsUnary || IsFakeUnary)
5823 Ops.push_back(N.getOperand(1));
5824 }
5825
5826 return true;
5827}
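// Decode examples (illustrative): X86ISD::PSHUFD v4i32 with immediate 0x1B
// produces Mask = <3,2,1,0>, Ops = {Op0}, IsUnary = true. A two-input node
// such as X86ISD::UNPCKL whose operands are the same value is treated as
// "fake unary": the mask is remapped into the first input and both operands
// are still pushed onto Ops.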
5828
5829// Wrapper for getTargetShuffleMask with InUnary;
5830static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5831                                  SmallVectorImpl<SDValue> &Ops,
5832                                  SmallVectorImpl<int> &Mask) {
5833 bool IsUnary;
5834 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5835}
5836
5837/// Compute whether each element of a shuffle is zeroable.
5838///
5839/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5840/// Either it is an undef element in the shuffle mask, the element of the input
5841/// referenced is undef, or the element of the input referenced is known to be
5842/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5843/// as many lanes with this technique as possible to simplify the remaining
5844/// shuffle.
5845 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5846                                            SDValue V1, SDValue V2,
5847 APInt &KnownUndef, APInt &KnownZero) {
5848 int Size = Mask.size();
5849 KnownUndef = KnownZero = APInt::getZero(Size);
5850
5851 V1 = peekThroughBitcasts(V1);
5852 V2 = peekThroughBitcasts(V2);
5853
5854 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5855 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5856
5857 int VectorSizeInBits = V1.getValueSizeInBits();
5858 int ScalarSizeInBits = VectorSizeInBits / Size;
5859 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5860
5861 for (int i = 0; i < Size; ++i) {
5862 int M = Mask[i];
5863 // Handle the easy cases.
5864 if (M < 0) {
5865 KnownUndef.setBit(i);
5866 continue;
5867 }
5868 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5869 KnownZero.setBit(i);
5870 continue;
5871 }
5872
5873 // Determine shuffle input and normalize the mask.
5874 SDValue V = M < Size ? V1 : V2;
5875 M %= Size;
5876
5877 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5878 if (V.getOpcode() != ISD::BUILD_VECTOR)
5879 continue;
5880
5881     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5882     // the (larger) source element must be UNDEF/ZERO.
5883 if ((Size % V.getNumOperands()) == 0) {
5884 int Scale = Size / V->getNumOperands();
5885 SDValue Op = V.getOperand(M / Scale);
5886 if (Op.isUndef())
5887 KnownUndef.setBit(i);
5888 if (X86::isZeroNode(Op))
5889 KnownZero.setBit(i);
5890 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5891 APInt Val = Cst->getAPIntValue();
5892 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5893 if (Val == 0)
5894 KnownZero.setBit(i);
5895 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5896 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5897 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5898 if (Val == 0)
5899 KnownZero.setBit(i);
5900 }
5901 continue;
5902 }
5903
5904     // If the BUILD_VECTOR has more elements, then all the (smaller) source
5905     // elements must be UNDEF or ZERO.
5906 if ((V.getNumOperands() % Size) == 0) {
5907 int Scale = V->getNumOperands() / Size;
5908 bool AllUndef = true;
5909 bool AllZero = true;
5910 for (int j = 0; j < Scale; ++j) {
5911 SDValue Op = V.getOperand((M * Scale) + j);
5912 AllUndef &= Op.isUndef();
5913 AllZero &= X86::isZeroNode(Op);
5914 }
5915 if (AllUndef)
5916 KnownUndef.setBit(i);
5917 if (AllZero)
5918 KnownZero.setBit(i);
5919 continue;
5920 }
5921 }
5922}
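// Example (illustrative): with Mask = <0,1,4,-1>, V1 = build_vector
// <i32 1, i32 0, i32 undef, i32 3> and V2 = all-zeros, element 1 reads the
// constant 0 from V1 so KnownZero[1] is set, element 2 reads the zero vector
// V2 so KnownZero[2] is set, and the -1 mask element sets KnownUndef[3].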
5923
5924/// Decode a target shuffle mask and inputs and see if any values are
5925/// known to be undef or zero from their inputs.
5926/// Returns true if the target shuffle mask was decoded.
5927/// FIXME: Merge this with computeZeroableShuffleElements?
5928 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5929                                          SmallVectorImpl<SDValue> &Ops,
5930                                          APInt &KnownUndef, APInt &KnownZero) {
5931 bool IsUnary;
5932 if (!isTargetShuffle(N.getOpcode()))
5933 return false;
5934
5935 MVT VT = N.getSimpleValueType();
5936 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5937 return false;
5938
5939 int Size = Mask.size();
5940 SDValue V1 = Ops[0];
5941 SDValue V2 = IsUnary ? V1 : Ops[1];
5942 KnownUndef = KnownZero = APInt::getZero(Size);
5943
5944 V1 = peekThroughBitcasts(V1);
5945 V2 = peekThroughBitcasts(V2);
5946
5947 assert((VT.getSizeInBits() % Size) == 0 &&
5948 "Illegal split of shuffle value type");
5949 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5950
5951 // Extract known constant input data.
5952 APInt UndefSrcElts[2];
5953 SmallVector<APInt, 32> SrcEltBits[2];
5954 bool IsSrcConstant[2] = {
5955 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5956 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5957 /*AllowPartialUndefs*/ false),
5958 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5959 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5960 /*AllowPartialUndefs*/ false)};
5961
5962 for (int i = 0; i < Size; ++i) {
5963 int M = Mask[i];
5964
5965 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5966 if (M < 0) {
5967 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5968 if (SM_SentinelUndef == M)
5969 KnownUndef.setBit(i);
5970 if (SM_SentinelZero == M)
5971 KnownZero.setBit(i);
5972 continue;
5973 }
5974
5975 // Determine shuffle input and normalize the mask.
5976 unsigned SrcIdx = M / Size;
5977 SDValue V = M < Size ? V1 : V2;
5978 M %= Size;
5979
5980 // We are referencing an UNDEF input.
5981 if (V.isUndef()) {
5982 KnownUndef.setBit(i);
5983 continue;
5984 }
5985
5986 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5987 // TODO: We currently only set UNDEF for integer types - floats use the same
5988 // registers as vectors and many of the scalar folded loads rely on the
5989 // SCALAR_TO_VECTOR pattern.
5990 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5991 (Size % V.getValueType().getVectorNumElements()) == 0) {
5992 int Scale = Size / V.getValueType().getVectorNumElements();
5993 int Idx = M / Scale;
5994 if (Idx != 0 && !VT.isFloatingPoint())
5995 KnownUndef.setBit(i);
5996 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5997 KnownZero.setBit(i);
5998 continue;
5999 }
6000
6001 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6002 // base vectors.
6003 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6004 SDValue Vec = V.getOperand(0);
6005 int NumVecElts = Vec.getValueType().getVectorNumElements();
6006 if (Vec.isUndef() && Size == NumVecElts) {
6007 int Idx = V.getConstantOperandVal(2);
6008 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6009 if (M < Idx || (Idx + NumSubElts) <= M)
6010 KnownUndef.setBit(i);
6011 }
6012 continue;
6013 }
6014
6015 // Attempt to extract from the source's constant bits.
6016 if (IsSrcConstant[SrcIdx]) {
6017 if (UndefSrcElts[SrcIdx][M])
6018 KnownUndef.setBit(i);
6019 else if (SrcEltBits[SrcIdx][M] == 0)
6020 KnownZero.setBit(i);
6021 }
6022 }
6023
6024 assert(VT.getVectorNumElements() == (unsigned)Size &&
6025 "Different mask size from vector size!");
6026 return true;
6027}
6028
6029// Replace target shuffle mask elements with known undef/zero sentinels.
6030 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6031                                               const APInt &KnownUndef,
6032                                               const APInt &KnownZero,
6033                                               bool ResolveKnownZeros = true) {
6034 unsigned NumElts = Mask.size();
6035 assert(KnownUndef.getBitWidth() == NumElts &&
6036 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6037
6038 for (unsigned i = 0; i != NumElts; ++i) {
6039 if (KnownUndef[i])
6040 Mask[i] = SM_SentinelUndef;
6041 else if (ResolveKnownZeros && KnownZero[i])
6042 Mask[i] = SM_SentinelZero;
6043 }
6044}
6045
6046// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6047 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6048                                               APInt &KnownUndef,
6049 APInt &KnownZero) {
6050 unsigned NumElts = Mask.size();
6051 KnownUndef = KnownZero = APInt::getZero(NumElts);
6052
6053 for (unsigned i = 0; i != NumElts; ++i) {
6054 int M = Mask[i];
6055 if (SM_SentinelUndef == M)
6056 KnownUndef.setBit(i);
6057 if (SM_SentinelZero == M)
6058 KnownZero.setBit(i);
6059 }
6060}
6061
6062// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6063 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6064                                          SDValue Cond, bool IsBLENDV = false) {
6065 EVT CondVT = Cond.getValueType();
6066 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6067 unsigned NumElts = CondVT.getVectorNumElements();
6068
6069 APInt UndefElts;
6070 SmallVector<APInt, 32> EltBits;
6071 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6072 /*AllowWholeUndefs*/ true,
6073 /*AllowPartialUndefs*/ false))
6074 return false;
6075
6076 Mask.resize(NumElts, SM_SentinelUndef);
6077
6078 for (int i = 0; i != (int)NumElts; ++i) {
6079 Mask[i] = i;
6080 // Arbitrarily choose from the 2nd operand if the select condition element
6081 // is undef.
6082 // TODO: Can we do better by matching patterns such as even/odd?
6083 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6084 (IsBLENDV && EltBits[i].isNonNegative()))
6085 Mask[i] += NumElts;
6086 }
6087
6088 return true;
6089}
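// Example (illustrative): a VSELECT on v4i32 with constant condition
// <-1, 0, -1, 0> produces Mask = <0, 5, 2, 7>, i.e. a blend taking elements
// 0 and 2 from the "true" operand and elements 1 and 3 from the "false"
// operand. For BLENDV nodes only the sign bit of each condition element is
// tested instead.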
6090
6091// Forward declaration (for getFauxShuffleMask recursive check).
6092static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6093                                    SmallVectorImpl<SDValue> &Inputs,
6094                                    SmallVectorImpl<int> &Mask,
6095                                    const SelectionDAG &DAG, unsigned Depth,
6096 bool ResolveKnownElts);
6097
6098// Attempt to decode ops that could be represented as a shuffle mask.
6099 // The decoded shuffle mask may contain a different number of elements than the
6100 // destination value type.
6101// TODO: Merge into getTargetShuffleInputs()
6102static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6103                                SmallVectorImpl<int> &Mask,
6104                                SmallVectorImpl<SDValue> &Ops,
6105                                const SelectionDAG &DAG, unsigned Depth,
6106 bool ResolveKnownElts) {
6107 Mask.clear();
6108 Ops.clear();
6109
6110 MVT VT = N.getSimpleValueType();
6111 unsigned NumElts = VT.getVectorNumElements();
6112 unsigned NumSizeInBits = VT.getSizeInBits();
6113 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6114 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6115 return false;
6116 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6117 unsigned NumSizeInBytes = NumSizeInBits / 8;
6118 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6119
6120 unsigned Opcode = N.getOpcode();
6121 switch (Opcode) {
6122 case ISD::VECTOR_SHUFFLE: {
6123 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6124 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6125 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6126 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6127 Ops.push_back(N.getOperand(0));
6128 Ops.push_back(N.getOperand(1));
6129 return true;
6130 }
6131 return false;
6132 }
6133 case ISD::AND:
6134 case X86ISD::ANDNP: {
6135 // Attempt to decode as a per-byte mask.
6136 APInt UndefElts;
6137 SmallVector<APInt, 32> EltBits;
6138 SDValue N0 = N.getOperand(0);
6139 SDValue N1 = N.getOperand(1);
6140 bool IsAndN = (X86ISD::ANDNP == Opcode);
6141 uint64_t ZeroMask = IsAndN ? 255 : 0;
6142 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6143 /*AllowWholeUndefs*/ false,
6144 /*AllowPartialUndefs*/ false))
6145 return false;
6146 // We can't assume an undef src element gives an undef dst - the other src
6147 // might be zero.
6148 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6149 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6150 const APInt &ByteBits = EltBits[i];
6151 if (ByteBits != 0 && ByteBits != 255)
6152 return false;
6153 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6154 }
6155 Ops.push_back(IsAndN ? N1 : N0);
6156 return true;
6157 }
6158 case ISD::OR: {
6159 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6160 // is a valid shuffle index.
6161 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6162 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6163 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6164 return false;
6165
6166 SmallVector<int, 64> SrcMask0, SrcMask1;
6167 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6168     APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6169     APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6170     if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6171 Depth + 1, true) ||
6172 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6173 Depth + 1, true))
6174 return false;
6175
6176 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6177 SmallVector<int, 64> Mask0, Mask1;
6178 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6179 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6180 for (int i = 0; i != (int)MaskSize; ++i) {
6181 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6182 // loops converting between OR and BLEND shuffles due to
6183 // canWidenShuffleElements merging away undef elements, meaning we
6184 // fail to recognise the OR as the undef element isn't known zero.
6185 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6186 Mask.push_back(SM_SentinelZero);
6187 else if (Mask1[i] == SM_SentinelZero)
6188 Mask.push_back(i);
6189 else if (Mask0[i] == SM_SentinelZero)
6190 Mask.push_back(i + MaskSize);
6191 else
6192 return false;
6193 }
6194 Ops.push_back(N.getOperand(0));
6195 Ops.push_back(N.getOperand(1));
6196 return true;
6197 }
6198 case ISD::CONCAT_VECTORS: {
6199 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6200 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6201 if (NumBitsPerElt == 64) {
6202 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6203 for (unsigned M = 0; M != NumSubElts; ++M)
6204 Mask.push_back((I * NumElts) + M);
6205 Ops.push_back(N.getOperand(I));
6206 }
6207 return true;
6208 }
6209 return false;
6210 }
6211 case ISD::INSERT_SUBVECTOR: {
6212 SDValue Src = N.getOperand(0);
6213 SDValue Sub = N.getOperand(1);
6214 EVT SubVT = Sub.getValueType();
6215 unsigned NumSubElts = SubVT.getVectorNumElements();
6216 uint64_t InsertIdx = N.getConstantOperandVal(2);
6217 // Subvector isn't demanded - just return the base vector.
6218 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6219 Mask.resize(NumElts);
6220 std::iota(Mask.begin(), Mask.end(), 0);
6221 Ops.push_back(Src);
6222 return true;
6223 }
6224 // Handle CONCAT(SUB0, SUB1).
6225 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6226 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6227 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6228 Src.getOperand(0).isUndef() &&
6229 Src.getOperand(1).getValueType() == SubVT &&
6230 Src.getConstantOperandVal(2) == 0 &&
6231 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6232 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6235 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6236 Ops.push_back(Src.getOperand(1));
6237 Ops.push_back(Sub);
6238 return true;
6239 }
6240 if (!N->isOnlyUserOf(Sub.getNode()))
6241 return false;
6242
6243 SmallVector<int, 64> SubMask;
6244 SmallVector<SDValue, 2> SubInputs;
6245     SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6246     EVT SubSrcVT = SubSrc.getValueType();
6247 if (!SubSrcVT.isVector())
6248 return false;
6249
6250 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6251 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6252 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6253 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6254 SDValue SubSrcSrc = SubSrc.getOperand(0);
6255 unsigned NumSubSrcSrcElts =
6256 SubSrcSrc.getValueType().getVectorNumElements();
6257 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6258 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6259 "Subvector valuetype mismatch");
6260 InsertIdx *= (MaxElts / NumElts);
6261 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6262 NumSubElts *= (MaxElts / NumElts);
6263 bool SrcIsUndef = Src.isUndef();
6264 for (int i = 0; i != (int)MaxElts; ++i)
6265 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6266 for (int i = 0; i != (int)NumSubElts; ++i)
6267 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6268 if (!SrcIsUndef)
6269 Ops.push_back(Src);
6270 Ops.push_back(SubSrcSrc);
6271 return true;
6272 }
6273
6274 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6275 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6276 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6277 Depth + 1, ResolveKnownElts))
6278 return false;
6279
6280 // Subvector shuffle inputs must not be larger than the subvector.
6281 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6282 return SubVT.getFixedSizeInBits() <
6283 SubInput.getValueSizeInBits().getFixedValue();
6284 }))
6285 return false;
6286
6287 if (SubMask.size() != NumSubElts) {
6288 assert(((SubMask.size() % NumSubElts) == 0 ||
6289 (NumSubElts % SubMask.size()) == 0) &&
6290 "Illegal submask scale");
6291 if ((NumSubElts % SubMask.size()) == 0) {
6292 int Scale = NumSubElts / SubMask.size();
6293 SmallVector<int, 64> ScaledSubMask;
6294 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6295 SubMask = ScaledSubMask;
6296 } else {
6297 int Scale = SubMask.size() / NumSubElts;
6298 NumSubElts = SubMask.size();
6299 NumElts *= Scale;
6300 InsertIdx *= Scale;
6301 }
6302 }
6303 Ops.push_back(Src);
6304 Ops.append(SubInputs.begin(), SubInputs.end());
6305 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6306 Mask.append(NumElts, SM_SentinelZero);
6307 else
6308 for (int i = 0; i != (int)NumElts; ++i)
6309 Mask.push_back(i);
6310 for (int i = 0; i != (int)NumSubElts; ++i) {
6311 int M = SubMask[i];
6312 if (0 <= M) {
6313 int InputIdx = M / NumSubElts;
6314 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6315 }
6316 Mask[i + InsertIdx] = M;
6317 }
6318 return true;
6319 }
6320 case X86ISD::PINSRB:
6321 case X86ISD::PINSRW:
6322   case ISD::SCALAR_TO_VECTOR:
6323   case ISD::INSERT_VECTOR_ELT: {
6324     // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6325     // vector, for matching src/dst vector types.
6326 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6327
6328 unsigned DstIdx = 0;
6329 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6330 // Check we have an in-range constant insertion index.
6331 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6332 N.getConstantOperandAPInt(2).uge(NumElts))
6333 return false;
6334 DstIdx = N.getConstantOperandVal(2);
6335
6336 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6337 if (X86::isZeroNode(Scl)) {
6338 Ops.push_back(N.getOperand(0));
6339 for (unsigned i = 0; i != NumElts; ++i)
6340 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6341 return true;
6342 }
6343 }
6344
6345 // Peek through trunc/aext/zext/bitcast.
6346 // TODO: aext shouldn't require SM_SentinelZero padding.
6347 // TODO: handle shift of scalars.
6348 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6349 while (Scl.getOpcode() == ISD::TRUNCATE ||
6350 Scl.getOpcode() == ISD::ANY_EXTEND ||
6351 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6352 (Scl.getOpcode() == ISD::BITCAST &&
6353             Scl.getScalarValueSizeInBits() ==
6354                 Scl.getOperand(0).getScalarValueSizeInBits())) {
6355       Scl = Scl.getOperand(0);
6356 MinBitsPerElt =
6357 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6358 }
6359 if ((MinBitsPerElt % 8) != 0)
6360 return false;
6361
6362 // Attempt to find the source vector the scalar was extracted from.
6363 SDValue SrcExtract;
6364 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6365 Scl.getOpcode() == X86ISD::PEXTRW ||
6366 Scl.getOpcode() == X86ISD::PEXTRB) &&
6367 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6368 SrcExtract = Scl;
6369 }
6370 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6371 return false;
6372
6373 SDValue SrcVec = SrcExtract.getOperand(0);
6374 EVT SrcVT = SrcVec.getValueType();
6375 if (!SrcVT.getScalarType().isByteSized())
6376 return false;
6377 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6378 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6379 unsigned DstByte = DstIdx * NumBytesPerElt;
6380 MinBitsPerElt =
6381 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6382
6383 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6384 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6385 Ops.push_back(SrcVec);
6386 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6387 } else {
6388 Ops.push_back(SrcVec);
6389 Ops.push_back(N.getOperand(0));
6390 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6391 Mask.push_back(NumSizeInBytes + i);
6392 }
6393
6394 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6395 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6396 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6397 Mask[DstByte + i] = SrcByte + i;
6398 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6399 Mask[DstByte + i] = SM_SentinelZero;
6400 return true;
6401 }
6402 case X86ISD::PACKSS:
6403 case X86ISD::PACKUS: {
6404 SDValue N0 = N.getOperand(0);
6405 SDValue N1 = N.getOperand(1);
6406 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6407 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6408 "Unexpected input value type");
6409
6410 APInt EltsLHS, EltsRHS;
6411 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6412
6413 // If we know input saturation won't happen (or we don't care for particular
6414 // lanes), we can treat this as a truncation shuffle.
6415 bool Offset0 = false, Offset1 = false;
6416 if (Opcode == X86ISD::PACKSS) {
6417 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6418 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6419 (!(N1.isUndef() || EltsRHS.isZero()) &&
6420 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6421 return false;
6422 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6423 // PACKSS then it was likely being used for sign-extension for a
6424 // truncation, so just peek through and adjust the mask accordingly.
6425 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6426 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6427 Offset0 = true;
6428 N0 = N0.getOperand(0);
6429 }
6430 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6431 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6432 Offset1 = true;
6433 N1 = N1.getOperand(0);
6434 }
6435 } else {
6436 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6437 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6438 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6439 (!(N1.isUndef() || EltsRHS.isZero()) &&
6440 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6441 return false;
6442 }
6443
6444 bool IsUnary = (N0 == N1);
6445
6446 Ops.push_back(N0);
6447 if (!IsUnary)
6448 Ops.push_back(N1);
6449
6450 createPackShuffleMask(VT, Mask, IsUnary);
6451
6452 if (Offset0 || Offset1) {
6453 for (int &M : Mask)
6454 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6455 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6456 ++M;
6457 }
6458 return true;
6459 }
6460 case ISD::VSELECT:
6461 case X86ISD::BLENDV: {
6462 SDValue Cond = N.getOperand(0);
6463 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6464 Ops.push_back(N.getOperand(1));
6465 Ops.push_back(N.getOperand(2));
6466 return true;
6467 }
6468 return false;
6469 }
6470 case X86ISD::VTRUNC: {
6471 SDValue Src = N.getOperand(0);
6472 EVT SrcVT = Src.getValueType();
6473 if (SrcVT.getSizeInBits() != NumSizeInBits)
6474 return false;
6475 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6476 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6477 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6478 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6479 for (unsigned i = 0; i != NumSrcElts; ++i)
6480 Mask.push_back(i * Scale);
6481 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6482 Ops.push_back(Src);
6483 return true;
6484 }
6485 case ISD::SHL:
6486 case ISD::SRL: {
6487 APInt UndefElts;
6488 SmallVector<APInt, 32> EltBits;
6489 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6490 UndefElts, EltBits,
6491 /*AllowWholeUndefs*/ true,
6492 /*AllowPartialUndefs*/ false))
6493 return false;
6494
6495 // We can only decode 'whole byte' bit shifts as shuffles.
6496 for (unsigned I = 0; I != NumElts; ++I)
6497 if (DemandedElts[I] && !UndefElts[I] &&
6498 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6499 return false;
6500
6501 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6502 Ops.push_back(N.getOperand(0));
6503
6504 for (unsigned I = 0; I != NumElts; ++I) {
6505 if (!DemandedElts[I] || UndefElts[I])
6506 continue;
6507 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6508 unsigned Lo = I * NumBytesPerElt;
6509 unsigned Hi = Lo + NumBytesPerElt;
6510 // Clear mask to all zeros and insert the shifted byte indices.
6511 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6512 if (ISD::SHL == Opcode)
6513 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6514 else
6515 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6516 Lo + ByteShift);
6517 }
6518 return true;
6519 }
6520 case X86ISD::VSHLI:
6521 case X86ISD::VSRLI: {
6522 uint64_t ShiftVal = N.getConstantOperandVal(1);
6523 // Out of range bit shifts are guaranteed to be zero.
6524 if (NumBitsPerElt <= ShiftVal) {
6525 Mask.append(NumElts, SM_SentinelZero);
6526 return true;
6527 }
6528
6529 // We can only decode 'whole byte' bit shifts as shuffles.
6530 if ((ShiftVal % 8) != 0)
6531 break;
6532
6533 uint64_t ByteShift = ShiftVal / 8;
6534 Ops.push_back(N.getOperand(0));
6535
6536 // Clear mask to all zeros and insert the shifted byte indices.
6537 Mask.append(NumSizeInBytes, SM_SentinelZero);
6538
6539 if (X86ISD::VSHLI == Opcode) {
6540 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6541 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6542 Mask[i + j] = i + j - ByteShift;
6543 } else {
6544 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6545 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6546 Mask[i + j - ByteShift] = i + j;
6547 }
6548 return true;
6549 }
6550 case X86ISD::VROTLI:
6551 case X86ISD::VROTRI: {
6552 // We can only decode 'whole byte' bit rotates as shuffles.
6553 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6554 if ((RotateVal % 8) != 0)
6555 return false;
6556 Ops.push_back(N.getOperand(0));
6557 int Offset = RotateVal / 8;
6558 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6559 for (int i = 0; i != (int)NumElts; ++i) {
6560 int BaseIdx = i * NumBytesPerElt;
6561 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6562 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6563 }
6564 }
6565 return true;
6566 }
6567 case X86ISD::VBROADCAST: {
6568 SDValue Src = N.getOperand(0);
6569 if (!Src.getSimpleValueType().isVector()) {
6570 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6571 !isNullConstant(Src.getOperand(1)) ||
6572 Src.getOperand(0).getValueType().getScalarType() !=
6573 VT.getScalarType())
6574 return false;
6575 Src = Src.getOperand(0);
6576 }
6577 Ops.push_back(Src);
6578 Mask.append(NumElts, 0);
6579 return true;
6580 }
6581   case ISD::SIGN_EXTEND_VECTOR_INREG: {
6582     SDValue Src = N.getOperand(0);
6583 EVT SrcVT = Src.getValueType();
6584 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6585
6586 // Extended source must be a simple vector.
6587 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6588 (NumBitsPerSrcElt % 8) != 0)
6589 return false;
6590
6591 // We can only handle all-signbits extensions.
6592 APInt DemandedSrcElts =
6593 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6594 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6595 return false;
6596
6597 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6598 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6599 for (unsigned I = 0; I != NumElts; ++I)
6600 Mask.append(Scale, I);
6601 Ops.push_back(Src);
6602 return true;
6603 }
6604 case ISD::ZERO_EXTEND:
6605 case ISD::ANY_EXTEND:
6606   case ISD::ZERO_EXTEND_VECTOR_INREG:
6607   case ISD::ANY_EXTEND_VECTOR_INREG: {
6608     SDValue Src = N.getOperand(0);
6609 EVT SrcVT = Src.getValueType();
6610
6611 // Extended source must be a simple vector.
6612 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6613 (SrcVT.getScalarSizeInBits() % 8) != 0)
6614 return false;
6615
6616 bool IsAnyExtend =
6617 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6618 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6619 IsAnyExtend, Mask);
6620 Ops.push_back(Src);
6621 return true;
6622 }
6623 }
6624
6625 return false;
6626}
6627
6628/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6629 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6630                                               SmallVectorImpl<int> &Mask) {
6631 int MaskWidth = Mask.size();
6632 SmallVector<SDValue, 16> UsedInputs;
6633 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6634 int lo = UsedInputs.size() * MaskWidth;
6635 int hi = lo + MaskWidth;
6636
6637 // Strip UNDEF input usage.
6638 if (Inputs[i].isUndef())
6639 for (int &M : Mask)
6640 if ((lo <= M) && (M < hi))
6641 M = SM_SentinelUndef;
6642
6643 // Check for unused inputs.
6644 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6645 for (int &M : Mask)
6646 if (lo <= M)
6647 M -= MaskWidth;
6648 continue;
6649 }
6650
6651 // Check for repeated inputs.
6652 bool IsRepeat = false;
6653 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6654 if (UsedInputs[j] != Inputs[i])
6655 continue;
6656 for (int &M : Mask)
6657 if (lo <= M)
6658 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6659 IsRepeat = true;
6660 break;
6661 }
6662 if (IsRepeat)
6663 continue;
6664
6665 UsedInputs.push_back(Inputs[i]);
6666 }
6667 Inputs = UsedInputs;
6668}
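// Example (illustrative): Inputs = {A, A} with Mask = <0,4,1,5> (mask width 4)
// collapses to Inputs = {A}, Mask = <0,0,1,1>; an input that no mask element
// references is dropped and all higher mask indices are shifted down by the
// mask width.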
6669
6670/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6671/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6672/// Returns true if the target shuffle mask was decoded.
6673static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6674                                    SmallVectorImpl<SDValue> &Inputs,
6675                                    SmallVectorImpl<int> &Mask,
6676                                    APInt &KnownUndef, APInt &KnownZero,
6677 const SelectionDAG &DAG, unsigned Depth,
6678 bool ResolveKnownElts) {
6679   if (Depth >= SelectionDAG::MaxRecursionDepth)
6680     return false; // Limit search depth.
6681
6682 EVT VT = Op.getValueType();
6683 if (!VT.isSimple() || !VT.isVector())
6684 return false;
6685
6686 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6687 if (ResolveKnownElts)
6688 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6689 return true;
6690 }
6691 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6692 ResolveKnownElts)) {
6693 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6694 return true;
6695 }
6696 return false;
6697}
6698
6699static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6700                                    SmallVectorImpl<SDValue> &Inputs,
6701                                    SmallVectorImpl<int> &Mask,
6702                                    const SelectionDAG &DAG, unsigned Depth,
6703 bool ResolveKnownElts) {
6704 APInt KnownUndef, KnownZero;
6705 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6706 KnownZero, DAG, Depth, ResolveKnownElts);
6707}
6708
6709 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6710                                    SmallVectorImpl<int> &Mask,
6711                                    const SelectionDAG &DAG, unsigned Depth = 0,
6712 bool ResolveKnownElts = true) {
6713 EVT VT = Op.getValueType();
6714 if (!VT.isSimple() || !VT.isVector())
6715 return false;
6716
6717 unsigned NumElts = Op.getValueType().getVectorNumElements();
6718 APInt DemandedElts = APInt::getAllOnes(NumElts);
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6720 ResolveKnownElts);
6721}
6722
6723// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6724static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6725 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6726 SelectionDAG &DAG) {
6727 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6728 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6729 "Unknown broadcast load type");
6730
6731   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6732 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6733 return SDValue();
6734
6735   SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6736                                          TypeSize::getFixed(Offset), DL);
6737   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6738 SDValue Ops[] = {Mem->getChain(), Ptr};
6739 SDValue BcstLd = DAG.getMemIntrinsicNode(
6740 Opcode, DL, Tys, Ops, MemVT,
6741       DAG.getMachineFunction().getMachineMemOperand(
6742           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6743 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6744 return BcstLd;
6745}
6746
6747/// Returns the scalar element that will make up the i'th
6748/// element of the result of the vector shuffle.
6749static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6750 SelectionDAG &DAG, unsigned Depth) {
6751   if (Depth >= SelectionDAG::MaxRecursionDepth)
6752     return SDValue(); // Limit search depth.
6753
6754 EVT VT = Op.getValueType();
6755 unsigned Opcode = Op.getOpcode();
6756 unsigned NumElems = VT.getVectorNumElements();
6757
6758 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6759 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6760 int Elt = SV->getMaskElt(Index);
6761
6762 if (Elt < 0)
6763 return DAG.getUNDEF(VT.getVectorElementType());
6764
6765 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6766 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6767 }
6768
6769 // Recurse into target specific vector shuffles to find scalars.
6770 if (isTargetShuffle(Opcode)) {
6771 MVT ShufVT = VT.getSimpleVT();
6772 MVT ShufSVT = ShufVT.getVectorElementType();
6773 int NumElems = (int)ShufVT.getVectorNumElements();
6774 SmallVector<int, 16> ShuffleMask;
6775     SmallVector<SDValue, 16> ShuffleOps;
6776     if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6777 return SDValue();
6778
6779 int Elt = ShuffleMask[Index];
6780 if (Elt == SM_SentinelZero)
6781 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6782 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6783 if (Elt == SM_SentinelUndef)
6784 return DAG.getUNDEF(ShufSVT);
6785
6786 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6787 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6788 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6789 }
6790
6791 // Recurse into insert_subvector base/sub vector to find scalars.
6792 if (Opcode == ISD::INSERT_SUBVECTOR) {
6793 SDValue Vec = Op.getOperand(0);
6794 SDValue Sub = Op.getOperand(1);
6795 uint64_t SubIdx = Op.getConstantOperandVal(2);
6796 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6797
6798 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6799 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6800 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6801 }
6802
6803 // Recurse into concat_vectors sub vector to find scalars.
6804 if (Opcode == ISD::CONCAT_VECTORS) {
6805 EVT SubVT = Op.getOperand(0).getValueType();
6806 unsigned NumSubElts = SubVT.getVectorNumElements();
6807 uint64_t SubIdx = Index / NumSubElts;
6808 uint64_t SubElt = Index % NumSubElts;
6809 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6810 }
6811
6812 // Recurse into extract_subvector src vector to find scalars.
6813 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6814 SDValue Src = Op.getOperand(0);
6815 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6816 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6817 }
6818
6819 // We only peek through bitcasts of the same vector width.
6820 if (Opcode == ISD::BITCAST) {
6821 SDValue Src = Op.getOperand(0);
6822 EVT SrcVT = Src.getValueType();
6823 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6824 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6825 return SDValue();
6826 }
6827
6828 // Actual nodes that may contain scalar elements
6829
6830 // For insert_vector_elt - either return the index matching scalar or recurse
6831 // into the base vector.
6832 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6833 isa<ConstantSDNode>(Op.getOperand(2))) {
6834 if (Op.getConstantOperandAPInt(2) == Index)
6835 return Op.getOperand(1);
6836 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6837 }
6838
6839 if (Opcode == ISD::SCALAR_TO_VECTOR)
6840 return (Index == 0) ? Op.getOperand(0)
6841 : DAG.getUNDEF(VT.getVectorElementType());
6842
6843 if (Opcode == ISD::BUILD_VECTOR)
6844 return Op.getOperand(Index);
6845
6846 return SDValue();
6847}
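// Example (illustrative): for shufflevector(<a,b,c,d>, undef, <2,2,3,3>),
// querying Index 0 follows mask element 2 into the first operand and, if that
// operand is a BUILD_VECTOR, returns the scalar 'c'; an SM_SentinelZero mask
// element in a target shuffle instead returns a zero constant of the
// shuffle's scalar type.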
6848
6849// Use PINSRB/PINSRW/PINSRD to create a build vector.
6850 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6851                                         const APInt &NonZeroMask,
6852 unsigned NumNonZero, unsigned NumZero,
6853 SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 MVT VT = Op.getSimpleValueType();
6856 unsigned NumElts = VT.getVectorNumElements();
6857 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6858 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6859 "Illegal vector insertion");
6860
6861 SDValue V;
6862 bool First = true;
6863
6864 for (unsigned i = 0; i < NumElts; ++i) {
6865 bool IsNonZero = NonZeroMask[i];
6866 if (!IsNonZero)
6867 continue;
6868
6869     // If the build vector contains zeros or our first insertion is not the
6870     // first index, then insert into a zero vector to break any register
6871     // dependency; otherwise use SCALAR_TO_VECTOR.
6872 if (First) {
6873 First = false;
6874 if (NumZero || 0 != i)
6875 V = getZeroVector(VT, Subtarget, DAG, DL);
6876 else {
6877 assert(0 == i && "Expected insertion into zero-index");
6878 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6879 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6880 V = DAG.getBitcast(VT, V);
6881 continue;
6882 }
6883 }
6884 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6885 DAG.getVectorIdxConstant(i, DL));
6886 }
6887
6888 return V;
6889}
6890
6891/// Custom lower build_vector of v16i8.
6892 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6893                                      const APInt &NonZeroMask,
6894 unsigned NumNonZero, unsigned NumZero,
6895 SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6898 return SDValue();
6899
6900 // SSE4.1 - use PINSRB to insert each byte directly.
6901 if (Subtarget.hasSSE41())
6902 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6903 DAG, Subtarget);
6904
6905 SDValue V;
6906
6907 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6908   // If both of the lowest two 16-bit elements contain non-zero bytes, then
6909   // convert to MOVD.
6909 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6910 !NonZeroMask.extractBits(2, 2).isZero()) {
6911 for (unsigned I = 0; I != 4; ++I) {
6912 if (!NonZeroMask[I])
6913 continue;
6914 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6915 if (I != 0)
6916 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6917 DAG.getConstant(I * 8, DL, MVT::i8));
6918 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6919 }
6920 assert(V && "Failed to fold v16i8 vector to zero");
6921 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6922 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6923 V = DAG.getBitcast(MVT::v8i16, V);
6924 }
6925 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6926 bool ThisIsNonZero = NonZeroMask[i];
6927 bool NextIsNonZero = NonZeroMask[i + 1];
6928 if (!ThisIsNonZero && !NextIsNonZero)
6929 continue;
6930
6931 SDValue Elt;
6932 if (ThisIsNonZero) {
6933 if (NumZero || NextIsNonZero)
6934 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6935 else
6936 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6937 }
6938
6939 if (NextIsNonZero) {
6940 SDValue NextElt = Op.getOperand(i + 1);
6941 if (i == 0 && NumZero)
6942 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6943 else
6944 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6945 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6946 DAG.getConstant(8, DL, MVT::i8));
6947 if (ThisIsNonZero)
6948 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6949 else
6950 Elt = NextElt;
6951 }
6952
6953 // If our first insertion is not at the first index or zeros are needed, then
6954 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6955 // elements undefined).
6956 if (!V) {
6957 if (i != 0 || NumZero)
6958 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6959 else {
6960 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6961 V = DAG.getBitcast(MVT::v8i16, V);
6962 continue;
6963 }
6964 }
6965 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6966 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6967 DAG.getVectorIdxConstant(i / 2, DL));
6968 }
6969
6970 return DAG.getBitcast(MVT::v16i8, V);
6971}
6972
6973 /// Custom lower build_vector of v8i16.
6974 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6975 const APInt &NonZeroMask,
6976 unsigned NumNonZero, unsigned NumZero,
6977 SelectionDAG &DAG,
6978 const X86Subtarget &Subtarget) {
6979 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6980 return SDValue();
6981
6982 // Use PINSRW to insert each byte directly.
6983 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6984 Subtarget);
6985}
6986
6987 /// Custom lower build_vector of v4i32 or v4f32.
6988 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6989 SelectionDAG &DAG,
6990 const X86Subtarget &Subtarget) {
6991 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6992 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6993 // Because we're creating a less complicated build vector here, we may enable
6994 // further folding of the MOVDDUP via shuffle transforms.
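// For example (illustrative only): a v4f32 build_vector (a, b, a, b) becomes
//   (v4f32 bitcast (MOVDDUP (v2f64 bitcast (build_vector a, b, undef, undef))))
// which splats the 64-bit pair {a, b} into both halves of the register.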
6995 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6996 Op.getOperand(0) == Op.getOperand(2) &&
6997 Op.getOperand(1) == Op.getOperand(3) &&
6998 Op.getOperand(0) != Op.getOperand(1)) {
6999 MVT VT = Op.getSimpleValueType();
7000 MVT EltVT = VT.getVectorElementType();
7001 // Create a new build vector with the first 2 elements followed by undef
7002 // padding, bitcast to v2f64, duplicate, and bitcast back.
7003 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7004 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7005 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7006 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7007 return DAG.getBitcast(VT, Dup);
7008 }
7009
7010 // Find all zeroable elements.
7011 std::bitset<4> Zeroable, Undefs;
7012 for (int i = 0; i < 4; ++i) {
7013 SDValue Elt = Op.getOperand(i);
7014 Undefs[i] = Elt.isUndef();
7015 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7016 }
7017 assert(Zeroable.size() - Zeroable.count() > 1 &&
7018 "We expect at least two non-zero elements!");
7019
7020 // We only know how to deal with build_vector nodes where elements are either
7021 // zeroable or extract_vector_elt with constant index.
7022 SDValue FirstNonZero;
7023 unsigned FirstNonZeroIdx;
7024 for (unsigned i = 0; i < 4; ++i) {
7025 if (Zeroable[i])
7026 continue;
7027 SDValue Elt = Op.getOperand(i);
7028 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7029     !isa<ConstantSDNode>(Elt.getOperand(1)))
7030 return SDValue();
7031 // Make sure that this node is extracting from a 128-bit vector.
7032 MVT VT = Elt.getOperand(0).getSimpleValueType();
7033 if (!VT.is128BitVector())
7034 return SDValue();
7035 if (!FirstNonZero.getNode()) {
7036 FirstNonZero = Elt;
7037 FirstNonZeroIdx = i;
7038 }
7039 }
7040
7041 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7042 SDValue V1 = FirstNonZero.getOperand(0);
7043 MVT VT = V1.getSimpleValueType();
7044
7045 // See if this build_vector can be lowered as a blend with zero.
7046 SDValue Elt;
7047 unsigned EltMaskIdx, EltIdx;
7048 int Mask[4];
7049 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7050 if (Zeroable[EltIdx]) {
7051 // The zero vector will be on the right hand side.
7052 Mask[EltIdx] = EltIdx+4;
7053 continue;
7054 }
7055
7056 Elt = Op->getOperand(EltIdx);
7057 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7058 EltMaskIdx = Elt.getConstantOperandVal(1);
7059 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7060 break;
7061 Mask[EltIdx] = EltIdx;
7062 }
7063
7064 if (EltIdx == 4) {
7065 // Let the shuffle legalizer deal with blend operations.
7066 SDValue VZeroOrUndef = (Zeroable == Undefs)
7067 ? DAG.getUNDEF(VT)
7068 : getZeroVector(VT, Subtarget, DAG, DL);
7069 if (V1.getSimpleValueType() != VT)
7070 V1 = DAG.getBitcast(VT, V1);
7071 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7072 }
7073
7074 // See if we can lower this build_vector to a INSERTPS.
7075 if (!Subtarget.hasSSE41())
7076 return SDValue();
7077
7078 SDValue V2 = Elt.getOperand(0);
7079 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7080 V1 = SDValue();
7081
7082 bool CanFold = true;
7083 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7084 if (Zeroable[i])
7085 continue;
7086
7087 SDValue Current = Op->getOperand(i);
7088 SDValue SrcVector = Current->getOperand(0);
7089 if (!V1.getNode())
7090 V1 = SrcVector;
7091 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7092 }
7093
7094 if (!CanFold)
7095 return SDValue();
7096
7097 assert(V1.getNode() && "Expected at least two non-zero elements!");
7098 if (V1.getSimpleValueType() != MVT::v4f32)
7099 V1 = DAG.getBitcast(MVT::v4f32, V1);
7100 if (V2.getSimpleValueType() != MVT::v4f32)
7101 V2 = DAG.getBitcast(MVT::v4f32, V2);
7102
7103 // Ok, we can emit an INSERTPS instruction.
7104 unsigned ZMask = Zeroable.to_ulong();
7105
7106 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7107 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7108 SDValue Result =
7109 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7110 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7111 return DAG.getBitcast(VT, Result);
7112}
7113
7114/// Return a vector logical shift node.
7115static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7116 SelectionDAG &DAG, const TargetLowering &TLI,
7117 const SDLoc &dl) {
7118 assert(VT.is128BitVector() && "Unknown type for VShift");
7119 MVT ShVT = MVT::v16i8;
7120 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7121 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7122 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7123 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7124 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7125}
7126
7127 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7128 SelectionDAG &DAG) {
7129
7130 // Check if the scalar load can be widened into a vector load. And if
7131 // the address is "base + cst" see if the cst can be "absorbed" into
7132 // the shuffle mask.
7133 if (auto *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7134 SDValue Ptr = LD->getBasePtr();
7135 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7136 return SDValue();
7137 EVT PVT = LD->getValueType(0);
7138 if (PVT != MVT::i32 && PVT != MVT::f32)
7139 return SDValue();
7140
7141 int FI = -1;
7142 int64_t Offset = 0;
7143 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7144 FI = FINode->getIndex();
7145 Offset = 0;
7146 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7147 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7148 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7149 Offset = Ptr.getConstantOperandVal(1);
7150 Ptr = Ptr.getOperand(0);
7151 } else {
7152 return SDValue();
7153 }
7154
7155 // FIXME: 256-bit vector instructions don't require a strict alignment,
7156 // improve this code to support it better.
7157 Align RequiredAlign(VT.getSizeInBits() / 8);
7158 SDValue Chain = LD->getChain();
7159 // Make sure the stack object alignment is at least 16 or 32.
7160 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7161 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7162 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7163 if (MFI.isFixedObjectIndex(FI)) {
7164 // Can't change the alignment. FIXME: It's possible to compute
7165 // the exact stack offset and reference FI + adjust offset instead.
7166 // If someone *really* cares about this. That's the way to implement it.
7167 return SDValue();
7168 } else {
7169 MFI.setObjectAlignment(FI, RequiredAlign);
7170 }
7171 }
7172
7173 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7174 // Ptr + (Offset & ~(RequiredAlign - 1)).
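// Worked example (assuming a 128-bit VT, so RequiredAlign == 16): for
// Offset == 8 we get StartOffset == 0 and EltNo == (8 - 0) >> 2 == 2, so we
// load a whole v4i32/v4f32 from Ptr and splat element 2 via the shuffle below.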
7175 if (Offset < 0)
7176 return SDValue();
7177 if ((Offset % RequiredAlign.value()) & 3)
7178 return SDValue();
7179 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7180 if (StartOffset) {
7181 SDLoc DL(Ptr);
7182 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7183 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7184 }
7185
7186 int EltNo = (Offset - StartOffset) >> 2;
7187 unsigned NumElems = VT.getVectorNumElements();
7188
7189 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7190 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7191 LD->getPointerInfo().getWithOffset(StartOffset));
7192
7193 SmallVector<int, 8> Mask(NumElems, EltNo);
7194
7195 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7196 }
7197
7198 return SDValue();
7199}
7200
7201 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7202static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7203 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7204 auto *BaseLd = cast<LoadSDNode>(Elt);
7205 if (!BaseLd->isSimple())
7206 return false;
7207 Ld = BaseLd;
7208 ByteOffset = 0;
7209 return true;
7210 }
7211
7212 switch (Elt.getOpcode()) {
7213 case ISD::BITCAST:
7214 case ISD::TRUNCATE:
7215 case ISD::SCALAR_TO_VECTOR:
7216 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7217 case ISD::SRL:
7218 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7219 uint64_t Amt = AmtC->getZExtValue();
7220 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7221 ByteOffset += Amt / 8;
7222 return true;
7223 }
7224 }
7225 break;
7226 case ISD::EXTRACT_VECTOR_ELT:
7227 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7228 SDValue Src = Elt.getOperand(0);
7229 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7230 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7231 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7232 findEltLoadSrc(Src, Ld, ByteOffset)) {
7233 uint64_t Idx = IdxC->getZExtValue();
7234 ByteOffset += Idx * (SrcSizeInBits / 8);
7235 return true;
7236 }
7237 }
7238 break;
7239 }
7240
7241 return false;
7242}
7243
7244/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7245/// elements can be replaced by a single large load which has the same value as
7246/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7247///
7248/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
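/// Another illustrative case: <load i32 *a, load i32 *a+4, load i32 *a+8,
/// load i32 *a+12> -> a single v4i32 load of *a, provided the loads are
/// simple, consecutive and non-volatile.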
7249 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7250                                         const SDLoc &DL, SelectionDAG &DAG,
7251 const X86Subtarget &Subtarget,
7252 bool IsAfterLegalize) {
7253 if ((VT.getScalarSizeInBits() % 8) != 0)
7254 return SDValue();
7255
7256 unsigned NumElems = Elts.size();
7257
7258 int LastLoadedElt = -1;
7259 APInt LoadMask = APInt::getZero(NumElems);
7260 APInt ZeroMask = APInt::getZero(NumElems);
7261 APInt UndefMask = APInt::getZero(NumElems);
7262
7263 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7264 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7265
7266 // For each element in the initializer, see if we've found a load, zero or an
7267 // undef.
7268 for (unsigned i = 0; i < NumElems; ++i) {
7269 SDValue Elt = peekThroughBitcasts(Elts[i]);
7270 if (!Elt.getNode())
7271 return SDValue();
7272 if (Elt.isUndef()) {
7273 UndefMask.setBit(i);
7274 continue;
7275 }
7276 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7277 ZeroMask.setBit(i);
7278 continue;
7279 }
7280
7281 // Each loaded element must be the correct fractional portion of the
7282 // requested vector load.
7283 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7284 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7285 return SDValue();
7286
7287 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7288 return SDValue();
7289 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7290 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7291 return SDValue();
7292
7293 LoadMask.setBit(i);
7294 LastLoadedElt = i;
7295 }
7296 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7297 NumElems &&
7298 "Incomplete element masks");
7299
7300 // Handle Special Cases - all undef or undef/zero.
7301 if (UndefMask.popcount() == NumElems)
7302 return DAG.getUNDEF(VT);
7303 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7304 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7305 : DAG.getConstantFP(0.0, DL, VT);
7306
7307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7308 int FirstLoadedElt = LoadMask.countr_zero();
7309 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7310 EVT EltBaseVT = EltBase.getValueType();
7311 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7312 "Register/Memory size mismatch");
7313 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7314 assert(LDBase && "Did not find base load for merging consecutive loads");
7315 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7316 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7317 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7318 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7319 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7320
7321 // TODO: Support offsetting the base load.
7322 if (ByteOffsets[FirstLoadedElt] != 0)
7323 return SDValue();
7324
7325 // Check to see if the element's load is consecutive to the base load
7326 // or offset from a previous (already checked) load.
7327 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7328 LoadSDNode *Ld = Loads[EltIdx];
7329 int64_t ByteOffset = ByteOffsets[EltIdx];
7330 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7331 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7332 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7333 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7334 }
7335 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7336 EltIdx - FirstLoadedElt);
7337 };
7338
7339 // Consecutive loads can contain UNDEFs but not ZERO elements.
7340 // Consecutive loads with UNDEF and ZERO elements require an
7341 // additional shuffle stage to clear the ZERO elements.
7342 bool IsConsecutiveLoad = true;
7343 bool IsConsecutiveLoadWithZeros = true;
7344 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7345 if (LoadMask[i]) {
7346 if (!CheckConsecutiveLoad(LDBase, i)) {
7347 IsConsecutiveLoad = false;
7348 IsConsecutiveLoadWithZeros = false;
7349 break;
7350 }
7351 } else if (ZeroMask[i]) {
7352 IsConsecutiveLoad = false;
7353 }
7354 }
7355
7356 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7357 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7358 assert(LDBase->isSimple() &&
7359 "Cannot merge volatile or atomic loads.");
7360 SDValue NewLd =
7361 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7362 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7363 for (auto *LD : Loads)
7364 if (LD)
7365 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7366 return NewLd;
7367 };
7368
7369 // Check if the base load is entirely dereferenceable.
7370 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7371 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7372
7373 // LOAD - all consecutive load/undefs (must start/end with a load or be
7374 // entirely dereferenceable). If we have found an entire vector of loads and
7375 // undefs, then return a large load of the entire vector width starting at the
7376 // base pointer. If the vector contains zeros, then attempt to shuffle those
7377 // elements.
7378 if (FirstLoadedElt == 0 &&
7379 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7380 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7381 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7382 return SDValue();
7383
7384 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7385 // will lower to regular temporal loads and use the cache.
7386 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7387 VT.is256BitVector() && !Subtarget.hasInt256())
7388 return SDValue();
7389
7390 if (NumElems == 1)
7391 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7392
7393 if (!ZeroMask)
7394 return CreateLoad(VT, LDBase);
7395
7396 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7397 // vector and a zero vector to clear out the zero elements.
7398 if (!IsAfterLegalize && VT.isVector()) {
7399 unsigned NumMaskElts = VT.getVectorNumElements();
7400 if ((NumMaskElts % NumElems) == 0) {
7401 unsigned Scale = NumMaskElts / NumElems;
7402 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7403 for (unsigned i = 0; i < NumElems; ++i) {
7404 if (UndefMask[i])
7405 continue;
7406 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7407 for (unsigned j = 0; j != Scale; ++j)
7408 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7409 }
7410 SDValue V = CreateLoad(VT, LDBase);
7411 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7412 : DAG.getConstantFP(0.0, DL, VT);
7413 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7414 }
7415 }
7416 }
7417
7418 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7419 if (VT.is256BitVector() || VT.is512BitVector()) {
7420 unsigned HalfNumElems = NumElems / 2;
7421 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7422 EVT HalfVT =
7423 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7424 SDValue HalfLD =
7425 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7426 DAG, Subtarget, IsAfterLegalize);
7427 if (HalfLD)
7428 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7429 HalfLD, DAG.getVectorIdxConstant(0, DL));
7430 }
7431 }
7432
7433 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7434 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7435 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7436 LoadSizeInBits == 64) &&
7437 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7438 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7439 : MVT::getIntegerVT(LoadSizeInBits);
7440 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7441 // Allow v4f32 on SSE1 only targets.
7442 // FIXME: Add more isel patterns so we can just use VT directly.
7443 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7444 VecVT = MVT::v4f32;
7445 if (TLI.isTypeLegal(VecVT)) {
7446 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7447 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7448 SDValue ResNode = DAG.getMemIntrinsicNode(
7449 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7450 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7451 for (auto *LD : Loads)
7452 if (LD)
7453 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7454 return DAG.getBitcast(VT, ResNode);
7455 }
7456 }
7457
7458 // BROADCAST - match the smallest possible repetition pattern, load that
7459 // scalar/subvector element and then broadcast to the entire vector.
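// For example (illustrative only): a v8i32 build_vector of loads
// <x, y, x, y, x, y, x, y> repeats every 64 bits, so it is enough to load the
// i64 pair {x, y} once and VBROADCAST it across the whole vector.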
7460 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7461 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7462 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7463 unsigned RepeatSize = SubElems * BaseSizeInBits;
7464 unsigned ScalarSize = std::min(RepeatSize, 64u);
7465 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7466 continue;
7467
7468 // Don't attempt a 1:N subvector broadcast - it should be caught by
7469 // combineConcatVectorOps, else will cause infinite loops.
7470 if (RepeatSize > ScalarSize && SubElems == 1)
7471 continue;
7472
7473 bool Match = true;
7474 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7475 for (unsigned i = 0; i != NumElems && Match; ++i) {
7476 if (!LoadMask[i])
7477 continue;
7478 SDValue Elt = peekThroughBitcasts(Elts[i]);
7479 if (RepeatedLoads[i % SubElems].isUndef())
7480 RepeatedLoads[i % SubElems] = Elt;
7481 else
7482 Match &= (RepeatedLoads[i % SubElems] == Elt);
7483 }
7484
7485 // We must have loads at both ends of the repetition.
7486 Match &= !RepeatedLoads.front().isUndef();
7487 Match &= !RepeatedLoads.back().isUndef();
7488 if (!Match)
7489 continue;
7490
7491 EVT RepeatVT =
7492 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7493 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7494 : EVT::getFloatingPointVT(ScalarSize);
7495 if (RepeatSize > ScalarSize)
7496 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7497 RepeatSize / ScalarSize);
7498 EVT BroadcastVT =
7499 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7500 VT.getSizeInBits() / ScalarSize);
7501 if (TLI.isTypeLegal(BroadcastVT)) {
7502 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7503 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7504 SDValue Broadcast = RepeatLoad;
7505 if (RepeatSize > ScalarSize) {
7506 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7507 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7508 } else {
7509 if (!Subtarget.hasAVX2() &&
7510     !X86::mayFoldLoadIntoBroadcastFromMem(
7511 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7512 Subtarget,
7513 /*AssumeSingleUse=*/true))
7514 return SDValue();
7515 Broadcast =
7516 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7517 }
7518 return DAG.getBitcast(VT, Broadcast);
7519 }
7520 }
7521 }
7522 }
7523
7524 return SDValue();
7525}
7526
7527 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7528// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7529 // are consecutive, non-overlapping, and in the right order.
7530 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7531 SelectionDAG &DAG,
7532 const X86Subtarget &Subtarget,
7533 bool IsAfterLegalize) {
7534 SmallVector<SDValue, 64> Elts;
7535 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7536 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7537 Elts.push_back(Elt);
7538 continue;
7539 }
7540 return SDValue();
7541 }
7542 assert(Elts.size() == VT.getVectorNumElements());
7543 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7544 IsAfterLegalize);
7545}
7546
7547 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7548 const APInt &Undefs, LLVMContext &C) {
7549 unsigned ScalarSize = VT.getScalarSizeInBits();
7550 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7551
7552 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7553 if (VT.isFloatingPoint()) {
7554 if (ScalarSize == 16)
7555 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7556 if (ScalarSize == 32)
7557 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7558 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7559 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7560 }
7561 return Constant::getIntegerValue(Ty, Val);
7562 };
7563
7564 SmallVector<Constant *, 32> ConstantVec;
7565 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7566 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7567 : getConstantScalar(Bits[I]));
7568
7569 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7570}
7571
7572static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7573 unsigned SplatBitSize, LLVMContext &C) {
7574 unsigned ScalarSize = VT.getScalarSizeInBits();
7575
7576 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7577 if (VT.isFloatingPoint()) {
7578 if (ScalarSize == 16)
7579 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7580 if (ScalarSize == 32)
7581 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7582 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7583 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7584 }
7585 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7586 };
7587
7588 if (ScalarSize == SplatBitSize)
7589 return getConstantScalar(SplatValue);
7590
7591 unsigned NumElm = SplatBitSize / ScalarSize;
7592 SmallVector<Constant *, 32> ConstantVec;
7593 for (unsigned I = 0; I != NumElm; ++I) {
7594 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7595 ConstantVec.push_back(getConstantScalar(Val));
7596 }
7597 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7598}
7599
7600 static bool isFoldableUseOfShuffle(SDNode *N) {
7601 for (auto *U : N->users()) {
7602 unsigned Opc = U->getOpcode();
7603 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7604 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7605 return false;
7606 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7607 return false;
7608 if (isTargetShuffle(Opc))
7609 return true;
7610 if (Opc == ISD::BITCAST) // Ignore bitcasts
7611 return isFoldableUseOfShuffle(U);
7612 if (N->hasOneUse()) {
7613 // TODO, there may be some general way to know if a SDNode can
7614 // be folded. We now only know whether an MI is foldable.
7615 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7616 return false;
7617 return true;
7618 }
7619 }
7620 return false;
7621}
7622
7623// If the node has a single use by a VSELECT then AVX512 targets may be able to
7624// fold as a predicated instruction.
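// Illustrative pattern only: if the sole use of V is
//   (vselect (v16i1 Mask), V, PassThru)
// then AVX512 can often execute the instruction that produces V with an
// embedded mask, so V is considered "maskable" here.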
7625static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7626 unsigned SizeInBits = V.getValueSizeInBits();
7627 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7628 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7629 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7630 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637/// Attempt to use the vbroadcast instruction to generate a splat value
7638/// from a splat BUILD_VECTOR which uses:
7639/// a. A single scalar load, or a constant.
7640/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7641///
7642/// The VBROADCAST node is returned when a pattern is found,
7643 /// or SDValue() otherwise.
7644 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7645 const SDLoc &dl,
7646 const X86Subtarget &Subtarget,
7647 SelectionDAG &DAG) {
7648 // VBROADCAST requires AVX.
7649 // TODO: Splats could be generated for non-AVX CPUs using SSE
7650 // instructions, but there's less potential gain for only 128-bit vectors.
7651 if (!Subtarget.hasAVX())
7652 return SDValue();
7653
7654 MVT VT = BVOp->getSimpleValueType(0);
7655 unsigned NumElts = VT.getVectorNumElements();
7656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7657 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7658 "Unsupported vector type for broadcast.");
7659
7660 // See if the build vector is a repeating sequence of scalars (inc. splat).
7661 SDValue Ld;
7662 BitVector UndefElements;
7663 SmallVector<SDValue, 16> Sequence;
7664 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7665 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7666 if (Sequence.size() == 1)
7667 Ld = Sequence[0];
7668 }
7669
7670 // Attempt to use VBROADCASTM
7671 // From this pattern:
7672 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7673 // b. t1 = (build_vector t0 t0)
7674 //
7675 // Create (VBROADCASTM v2i1 X)
7676 if (!Sequence.empty() && Subtarget.hasCDI()) {
7677 // If not a splat, are the upper sequence values zeroable?
7678 unsigned SeqLen = Sequence.size();
7679 bool UpperZeroOrUndef =
7680 SeqLen == 1 ||
7681 llvm::all_of(ArrayRef(Sequence).drop_front(),
7682 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7683 SDValue Op0 = Sequence[0];
7684 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7685 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7686 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7687 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7688 ? Op0.getOperand(0)
7689 : Op0.getOperand(0).getOperand(0);
7690 MVT MaskVT = BOperand.getSimpleValueType();
7691 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7692 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7693 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7694 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7695 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7696 unsigned Scale = 512 / VT.getSizeInBits();
7697 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7698 }
7699 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7700 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7701 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7702 return DAG.getBitcast(VT, Bcst);
7703 }
7704 }
7705 }
7706
7707 unsigned NumUndefElts = UndefElements.count();
7708 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7709 APInt SplatValue, Undef;
7710 unsigned SplatBitSize;
7711 bool HasUndef;
7712 // Check if this is a repeated constant pattern suitable for broadcasting.
7713 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7714 SplatBitSize > VT.getScalarSizeInBits() &&
7715 SplatBitSize < VT.getSizeInBits()) {
7716 // Avoid replacing with broadcast when it's a use of a shuffle
7717 // instruction to preserve the present custom lowering of shuffles.
7718 if (isFoldableUseOfShuffle(BVOp))
7719 return SDValue();
7720 // replace BUILD_VECTOR with broadcast of the repeated constants.
7721 LLVMContext *Ctx = DAG.getContext();
7722 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7723 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7724 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7725 // Load the constant scalar/subvector and broadcast it.
7726 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7727 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7728 SDValue CP = DAG.getConstantPool(C, PVT);
7729 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7730
7731 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7732 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7733 SDValue Ops[] = {DAG.getEntryNode(), CP};
7734 MachinePointerInfo MPI =
7735     MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7736 SDValue Brdcst =
7737     DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7738 MPI, Alignment, MachineMemOperand::MOLoad);
7739 return DAG.getBitcast(VT, Brdcst);
7740 }
7741 if (SplatBitSize > 64) {
7742 // Load the vector of constants and broadcast it.
7743 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7744 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7745 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7746 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7747 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7748 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7749 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7750 MachinePointerInfo MPI =
7751     MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7752 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7753 Ops, VVT, MPI, Alignment,
7754 MachineMemOperand::MOLoad);
7755 }
7756 }
7757
7758 // If we are moving a scalar into a vector (Ld must be set and all elements
7759 // but 1 are undef) and that operation is not obviously supported by
7760 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7761 // That's better than general shuffling and may eliminate a load to GPR and
7762 // move from scalar to vector register.
7763 if (!Ld || NumElts - NumUndefElts != 1)
7764 return SDValue();
7765 unsigned ScalarSize = Ld.getValueSizeInBits();
7766 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7767 return SDValue();
7768 }
7769
7770 bool ConstSplatVal =
7771 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7772 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7773
7774 // TODO: Handle broadcasts of non-constant sequences.
7775
7776 // Make sure that all of the users of a non-constant load are from the
7777 // BUILD_VECTOR node.
7778 // FIXME: Is the use count needed for non-constant, non-load case?
7779 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7780 return SDValue();
7781
7782 unsigned ScalarSize = Ld.getValueSizeInBits();
7783 bool IsGE256 = (VT.getSizeInBits() >= 256);
7784
7785 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7786 // instruction to save 8 or more bytes of constant pool data.
7787 // TODO: If multiple splats are generated to load the same constant,
7788 // it may be detrimental to overall size. There needs to be a way to detect
7789 // that condition to know if this is truly a size win.
7790 bool OptForSize = DAG.shouldOptForSize();
7791
7792 // Handle broadcasting a single constant scalar from the constant pool
7793 // into a vector.
7794 // On Sandybridge (no AVX2), it is still better to load a constant vector
7795 // from the constant pool and not to broadcast it from a scalar.
7796 // But override that restriction when optimizing for size.
7797 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7798 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7799 EVT CVT = Ld.getValueType();
7800 assert(!CVT.isVector() && "Must not broadcast a vector type");
7801
7802 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7803 // For size optimization, also splat v2f64 and v2i64, and for size opt
7804 // with AVX2, also splat i8 and i16.
7805 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7806 if (ScalarSize == 32 ||
7807 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7808 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7809 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7810 const Constant *C = nullptr;
7811 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7812 C = CI->getConstantIntValue();
7813 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7814 C = CF->getConstantFPValue();
7815
7816 assert(C && "Invalid constant type");
7817
7818 SDValue CP =
7819     DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7820 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7821
7822 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7823 SDValue Ops[] = {DAG.getEntryNode(), CP};
7824 MachinePointerInfo MPI =
7825     MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7826 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7827 MPI, Alignment, MachineMemOperand::MOLoad);
7828 }
7829 }
7830
7831 // Handle AVX2 in-register broadcasts.
7832 if (!IsLoad && Subtarget.hasInt256() &&
7833 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7834 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7835
7836 // The scalar source must be a normal load.
7837 if (!IsLoad)
7838 return SDValue();
7839
7840 // Make sure the non-chain result is only used by this build vector.
7841 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7842 return SDValue();
7843
7844 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7845 (Subtarget.hasVLX() && ScalarSize == 64)) {
7846 auto *LN = cast<LoadSDNode>(Ld);
7847 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7848 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7849 SDValue BCast =
7850     DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7851 LN->getMemoryVT(), LN->getMemOperand());
7852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7853 return BCast;
7854 }
7855
7856 // The integer check is needed for the 64-bit into 128-bit case, so that it
7857 // doesn't match double, since there is no vbroadcastsd xmm.
7858 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7859 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864     DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7871 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7872
7873 // Unsupported broadcast.
7874 return SDValue();
7875}
7876
7877/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7878/// underlying vector and index.
7879///
7880/// Modifies \p ExtractedFromVec to the real vector and returns the real
7881/// index.
7882static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7883 SDValue ExtIdx) {
7884 int Idx = ExtIdx->getAsZExtVal();
7885 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7886 return Idx;
7887
7888 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7889 // lowered this:
7890 // (extract_vector_elt (v8f32 %1), Constant<6>)
7891 // to:
7892 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7893 // (extract_subvector (v8f32 %0), Constant<4>),
7894 // undef)
7895 // Constant<0>)
7896 // In this case the vector is the extract_subvector expression and the index
7897 // is 2, as specified by the shuffle.
7898 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7899 SDValue ShuffleVec = SVOp->getOperand(0);
7900 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7901 assert(ShuffleVecVT.getVectorElementType() ==
7902 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7903
7904 int ShuffleIdx = SVOp->getMaskElt(Idx);
7905 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7906 ExtractedFromVec = ShuffleVec;
7907 return ShuffleIdx;
7908 }
7909 return Idx;
7910}
7911
7912 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7913 SelectionDAG &DAG) {
7914 MVT VT = Op.getSimpleValueType();
7915
7916 // Skip if insert_vec_elt is not supported.
7917 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7918 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7919 return SDValue();
7920
7921 unsigned NumElems = Op.getNumOperands();
7922 SDValue VecIn1;
7923 SDValue VecIn2;
7924 SmallVector<unsigned, 4> InsertIndices;
7925 SmallVector<int, 8> Mask(NumElems, -1);
7926
7927 for (unsigned i = 0; i != NumElems; ++i) {
7928 unsigned Opc = Op.getOperand(i).getOpcode();
7929
7930 if (Opc == ISD::UNDEF)
7931 continue;
7932
7933 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7934 // Quit if more than 1 element needs inserting.
7935 if (InsertIndices.size() > 1)
7936 return SDValue();
7937
7938 InsertIndices.push_back(i);
7939 continue;
7940 }
7941
7942 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7943 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7944
7945 // Quit if non-constant index.
7946 if (!isa<ConstantSDNode>(ExtIdx))
7947 return SDValue();
7948 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7949
7950 // Quit if extracted from vector of different type.
7951 if (ExtractedFromVec.getValueType() != VT)
7952 return SDValue();
7953
7954 if (!VecIn1.getNode())
7955 VecIn1 = ExtractedFromVec;
7956 else if (VecIn1 != ExtractedFromVec) {
7957 if (!VecIn2.getNode())
7958 VecIn2 = ExtractedFromVec;
7959 else if (VecIn2 != ExtractedFromVec)
7960 // Quit if more than 2 vectors to shuffle
7961 return SDValue();
7962 }
7963
7964 if (ExtractedFromVec == VecIn1)
7965 Mask[i] = Idx;
7966 else if (ExtractedFromVec == VecIn2)
7967 Mask[i] = Idx + NumElems;
7968 }
7969
7970 if (!VecIn1.getNode())
7971 return SDValue();
7972
7973 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7974 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7975
7976 for (unsigned Idx : InsertIndices)
7977 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7978 DAG.getVectorIdxConstant(Idx, DL));
7979
7980 return NV;
7981}
7982
7983 // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7984 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7985 const X86Subtarget &Subtarget) {
7986 MVT VT = Op.getSimpleValueType();
7987 MVT IVT =
7988 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7989 SmallVector<SDValue, 16> NewOps;
7990 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7991 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7992 Op.getOperand(I)));
7993 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7994 return DAG.getBitcast(VT, Res);
7995}
7996
7997 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7998 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7999 SelectionDAG &DAG,
8000 const X86Subtarget &Subtarget) {
8001
8002 MVT VT = Op.getSimpleValueType();
8003 assert((VT.getVectorElementType() == MVT::i1) &&
8004 "Unexpected type in LowerBUILD_VECTORvXi1!");
8005 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8006 ISD::isBuildVectorAllOnes(Op.getNode()))
8007 return Op;
8008
8009 uint64_t Immediate = 0;
8010 SmallVector<unsigned, 16> NonConstIdx;
8011 bool IsSplat = true;
8012 bool HasConstElts = false;
8013 int SplatIdx = -1;
8014 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8015 SDValue In = Op.getOperand(idx);
8016 if (In.isUndef())
8017 continue;
8018 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8019 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8020 HasConstElts = true;
8021 } else {
8022 NonConstIdx.push_back(idx);
8023 }
8024 if (SplatIdx < 0)
8025 SplatIdx = idx;
8026 else if (In != Op.getOperand(SplatIdx))
8027 IsSplat = false;
8028 }
8029
8030 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8031 if (IsSplat) {
8032 // The build_vector allows the scalar element to be larger than the vector
8033 // element type. We need to mask it to use as a condition unless we know
8034 // the upper bits are zero.
8035 // FIXME: Use computeKnownBits instead of checking specific opcode?
8036 SDValue Cond = Op.getOperand(SplatIdx);
8037 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8038 if (Cond.getOpcode() != ISD::SETCC)
8039 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8040 DAG.getConstant(1, dl, MVT::i8));
8041
8042 // Perform the select in the scalar domain so we can use cmov.
8043 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8044 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8045 DAG.getAllOnesConstant(dl, MVT::i32),
8046 DAG.getConstant(0, dl, MVT::i32));
8047 Select = DAG.getBitcast(MVT::v32i1, Select);
8048 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8049 } else {
8050 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8051 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8052 DAG.getAllOnesConstant(dl, ImmVT),
8053 DAG.getConstant(0, dl, ImmVT));
8054 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8055 Select = DAG.getBitcast(VecVT, Select);
8056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8057 DAG.getVectorIdxConstant(0, dl));
8058 }
8059 }
8060
8061 // insert elements one by one
8062 SDValue DstVec;
8063 if (HasConstElts) {
8064 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8065 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8066 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8067 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8068 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8069 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8070 } else {
8071 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8072 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8073 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8074 DstVec = DAG.getBitcast(VecVT, Imm);
8075 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8076 DAG.getVectorIdxConstant(0, dl));
8077 }
8078 } else
8079 DstVec = DAG.getUNDEF(VT);
8080
8081 for (unsigned InsertIdx : NonConstIdx) {
8082 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8083 Op.getOperand(InsertIdx),
8084 DAG.getVectorIdxConstant(InsertIdx, dl));
8085 }
8086 return DstVec;
8087}
8088
8089LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8090 switch (Opcode) {
8091 case X86ISD::PACKSS:
8092 case X86ISD::PACKUS:
8093 case X86ISD::FHADD:
8094 case X86ISD::FHSUB:
8095 case X86ISD::HADD:
8096 case X86ISD::HSUB:
8097 return true;
8098 }
8099 return false;
8100}
8101
8102/// This is a helper function of LowerToHorizontalOp().
8103 /// This function checks that the input build_vector \p N implements a
8104/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8105/// may not match the layout of an x86 256-bit horizontal instruction.
8106/// In other words, if this returns true, then some extraction/insertion will
8107/// be required to produce a valid horizontal instruction.
8108///
8109/// Parameter \p Opcode defines the kind of horizontal operation to match.
8110/// For example, if \p Opcode is equal to ISD::ADD, then this function
8111/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8112/// is equal to ISD::SUB, then this function checks if this is a horizontal
8113/// arithmetic sub.
8114///
8115/// This function only analyzes elements of \p N whose indices are
8116/// in range [BaseIdx, LastIdx).
8117///
8118/// TODO: This function was originally used to match both real and fake partial
8119/// horizontal operations, but the index-matching logic is incorrect for that.
8120/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8121/// code because it is only used for partial h-op matching now?
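/// Illustrative example (v8f32, \p Opcode == ISD::FADD, [BaseIdx, LastIdx) ==
/// [0, 4)): a build_vector whose first operands are
///   (fadd (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (fadd (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// matches a partial horizontal add with \p V0 == A.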
8122static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8123 const SDLoc &DL, SelectionDAG &DAG,
8124 unsigned BaseIdx, unsigned LastIdx,
8125 SDValue &V0, SDValue &V1) {
8126 EVT VT = N->getValueType(0);
8127 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8128 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8129 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8130 "Invalid Vector in input!");
8131
8132 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8133 bool CanFold = true;
8134 unsigned ExpectedVExtractIdx = BaseIdx;
8135 unsigned NumElts = LastIdx - BaseIdx;
8136 V0 = DAG.getUNDEF(VT);
8137 V1 = DAG.getUNDEF(VT);
8138
8139 // Check if N implements a horizontal binop.
8140 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8141 SDValue Op = N->getOperand(i + BaseIdx);
8142
8143 // Skip UNDEFs.
8144 if (Op->isUndef()) {
8145 // Update the expected vector extract index.
8146 if (i * 2 == NumElts)
8147 ExpectedVExtractIdx = BaseIdx;
8148 ExpectedVExtractIdx += 2;
8149 continue;
8150 }
8151
8152 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8153
8154 if (!CanFold)
8155 break;
8156
8157 SDValue Op0 = Op.getOperand(0);
8158 SDValue Op1 = Op.getOperand(1);
8159
8160 // Try to match the following pattern:
8161 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8162 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8163            Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8164 Op0.getOperand(0) == Op1.getOperand(0) &&
8165 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8166 isa<ConstantSDNode>(Op1.getOperand(1)));
8167 if (!CanFold)
8168 break;
8169
8170 unsigned I0 = Op0.getConstantOperandVal(1);
8171 unsigned I1 = Op1.getConstantOperandVal(1);
8172
8173 if (i * 2 < NumElts) {
8174 if (V0.isUndef()) {
8175 V0 = Op0.getOperand(0);
8176 if (V0.getValueType() != VT)
8177 return false;
8178 }
8179 } else {
8180 if (V1.isUndef()) {
8181 V1 = Op0.getOperand(0);
8182 if (V1.getValueType() != VT)
8183 return false;
8184 }
8185 if (i * 2 == NumElts)
8186 ExpectedVExtractIdx = BaseIdx;
8187 }
8188
8189 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8190 if (I0 == ExpectedVExtractIdx)
8191 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8192 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8193 // Try to match the following dag sequence:
8194 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8195 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8196 } else
8197 CanFold = false;
8198
8199 ExpectedVExtractIdx += 2;
8200 }
8201
8202 return CanFold;
8203}
8204
8205/// Emit a sequence of two 128-bit horizontal add/sub followed by
8206/// a concat_vector.
8207///
8208/// This is a helper function of LowerToHorizontalOp().
8209/// This function expects two 256-bit vectors called V0 and V1.
8210/// At first, each vector is split into two separate 128-bit vectors.
8211/// Then, the resulting 128-bit vectors are used to implement two
8212/// horizontal binary operations.
8213///
8214/// The kind of horizontal binary operation is defined by \p X86Opcode.
8215///
8216 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
8217 /// the two new horizontal binops.
8218/// When Mode is set, the first horizontal binop dag node would take as input
8219/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8220/// horizontal binop dag node would take as input the lower 128-bit of V1
8221/// and the upper 128-bit of V1.
8222/// Example:
8223/// HADD V0_LO, V0_HI
8224/// HADD V1_LO, V1_HI
8225///
8226/// Otherwise, the first horizontal binop dag node takes as input the lower
8227/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8228/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8229/// Example:
8230/// HADD V0_LO, V1_LO
8231/// HADD V0_HI, V1_HI
8232///
8233/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8234/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8235/// the upper 128-bits of the result.
8236static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8237 const SDLoc &DL, SelectionDAG &DAG,
8238 unsigned X86Opcode, bool Mode,
8239 bool isUndefLO, bool isUndefHI) {
8240 MVT VT = V0.getSimpleValueType();
8241 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8242 "Invalid nodes in input!");
8243
8244 unsigned NumElts = VT.getVectorNumElements();
8245 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8246 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8247 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8248 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8249 MVT NewVT = V0_LO.getSimpleValueType();
8250
8251 SDValue LO = DAG.getUNDEF(NewVT);
8252 SDValue HI = DAG.getUNDEF(NewVT);
8253
8254 if (Mode) {
8255 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8256 if (!isUndefLO && !V0->isUndef())
8257 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8258 if (!isUndefHI && !V1->isUndef())
8259 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8260 } else {
8261 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8262 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8263 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8264
8265 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8266 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8267 }
8268
8269 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8270}
8271
8272/// Returns true iff \p BV builds a vector with the result equivalent to
8273/// the result of ADDSUB/SUBADD operation.
8274/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8275/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8276 /// \p Opnd0 and \p Opnd1.
8277 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8278 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8279 SDValue &Opnd0, SDValue &Opnd1,
8280 unsigned &NumExtracts, bool &IsSubAdd,
8281 bool &HasAllowContract) {
8282 using namespace SDPatternMatch;
8283
8284 MVT VT = BV->getSimpleValueType(0);
8285 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8286 return false;
8287
8288 unsigned NumElts = VT.getVectorNumElements();
8289 SDValue InVec0 = DAG.getUNDEF(VT);
8290 SDValue InVec1 = DAG.getUNDEF(VT);
8291
8292 NumExtracts = 0;
8293 HasAllowContract = NumElts != 0;
8294
8295 // Odd-numbered elements in the input build vector are obtained from
8296 // adding/subtracting two integer/float elements.
8297 // Even-numbered elements in the input build vector are obtained from
8298 // subtracting/adding two integer/float elements.
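// Illustrative sketch for v4f32: a build_vector of the form
//   (fsub (extractelt A, 0), (extractelt B, 0)),
//   (fadd (extractelt A, 1), (extractelt B, 1)),
//   (fsub (extractelt A, 2), (extractelt B, 2)),
//   (fadd (extractelt A, 3), (extractelt B, 3))
// is the ADDSUB(A, B) pattern; with the fadd/fsub parity mirrored it is the
// SUBADD pattern instead.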
8299 unsigned Opc[2] = {0, 0};
8300 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8301 SDValue Op = BV->getOperand(i);
8302
8303 // Skip 'undef' values.
8304 unsigned Opcode = Op.getOpcode();
8305 if (Opcode == ISD::UNDEF)
8306 continue;
8307
8308 // Early exit if we found an unexpected opcode.
8309 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8310 return false;
8311
8312 SDValue Op0 = Op.getOperand(0);
8313 SDValue Op1 = Op.getOperand(1);
8314
8315 // Try to match the following pattern:
8316 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8317 // Early exit if we cannot match that sequence.
8318 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8319 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8320 return false;
8321
8322 // We found a valid add/sub node, make sure it's the same opcode as previous
8323 // elements for this parity.
8324 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8325 return false;
8326 Opc[i % 2] = Opcode;
8327
8328 // Update InVec0 and InVec1.
8329 if (InVec0.isUndef())
8330 InVec0 = Op0.getOperand(0);
8331 if (InVec1.isUndef())
8332 InVec1 = Op1.getOperand(0);
8333
8334 // Make sure that the operands of each add/sub node always
8335 // come from the same pair of vectors.
8336 if (InVec0 != Op0.getOperand(0)) {
8337 if (Opcode == ISD::FSUB)
8338 return false;
8339
8340 // FADD is commutable. Try to commute the operands
8341 // and then test again.
8342 std::swap(Op0, Op1);
8343 if (InVec0 != Op0.getOperand(0))
8344 return false;
8345 }
8346
8347 if (InVec1 != Op1.getOperand(0))
8348 return false;
8349
8350 // Increment the number of extractions done.
8351 ++NumExtracts;
8352 HasAllowContract &= Op->getFlags().hasAllowContract();
8353 }
8354
8355 // Ensure we have found an opcode for both parities and that they are
8356 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8357 // inputs are undef.
8358 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8359 InVec0.isUndef() || InVec1.isUndef())
8360 return false;
8361
8362 IsSubAdd = Opc[0] == ISD::FADD;
8363
8364 Opnd0 = InVec0;
8365 Opnd1 = InVec1;
8366 return true;
8367}
8368
8369 /// Returns true if it is possible to fold MUL and an idiom that has already been
8370/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8371/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8372/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8373///
8374/// Prior to calling this function it should be known that there is some
8375/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8376/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8377/// before replacement of such SDNode with ADDSUB operation. Thus the number
8378/// of \p Opnd0 uses is expected to be equal to 2.
8379/// For example, this function may be called for the following IR:
8380/// %AB = fmul fast <2 x double> %A, %B
8381/// %Sub = fsub fast <2 x double> %AB, %C
8382/// %Add = fadd fast <2 x double> %AB, %C
8383/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8384/// <2 x i32> <i32 0, i32 3>
8385/// There is a def for %Addsub here, which potentially can be replaced by
8386/// X86ISD::ADDSUB operation:
8387/// %Addsub = X86ISD::ADDSUB %AB, %C
8388/// and such ADDSUB can further be replaced with FMADDSUB:
8389/// %Addsub = FMADDSUB %A, %B, %C.
8390///
8391/// The main reason why this method is called before the replacement of the
8392/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8393/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8394/// FMADDSUB is.
8395static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8396 SelectionDAG &DAG, SDValue &Opnd0,
8397 SDValue &Opnd1, SDValue &Opnd2,
8398 unsigned ExpectedUses,
8399 bool AllowSubAddOrAddSubContract) {
8400 if (Opnd0.getOpcode() != ISD::FMUL ||
8401 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8402 return false;
8403
8404 // FIXME: These checks must match the similar ones in
8405 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8406 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8407 // or MUL + ADDSUB to FMADDSUB.
8408 const TargetOptions &Options = DAG.getTarget().Options;
8409 bool AllowFusion =
8410 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8411 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8412 if (!AllowFusion)
8413 return false;
8414
8415 Opnd2 = Opnd1;
8416 Opnd1 = Opnd0.getOperand(1);
8417 Opnd0 = Opnd0.getOperand(0);
8418
8419 return true;
8420}
8421
8422/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8423/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8424 /// X86ISD::FMSUBADD node.
8425 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8426 const SDLoc &DL,
8427 const X86Subtarget &Subtarget,
8428 SelectionDAG &DAG) {
8429 SDValue Opnd0, Opnd1;
8430 unsigned NumExtracts;
8431 bool IsSubAdd;
8432 bool HasAllowContract;
8433 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8434 HasAllowContract))
8435 return SDValue();
8436
8437 MVT VT = BV->getSimpleValueType(0);
8438
8439 // Try to generate X86ISD::FMADDSUB node here.
8440 SDValue Opnd2;
8441 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8442 HasAllowContract)) {
8443 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8444 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8445 }
8446
8447 // We only support ADDSUB.
8448 if (IsSubAdd)
8449 return SDValue();
8450
8451 // There are no known X86 targets with 512-bit ADDSUB instructions!
8452 // Convert to blend(fsub,fadd).
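  // Illustrative example: for v8f64 the loop below builds the shuffle mask
  // {0, 9, 2, 11, 4, 13, 6, 15}, so even result elements come from the FSUB
  // node and odd result elements from the FADD node - matching ADDSUB
  // semantics (subtract in even lanes, add in odd lanes).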
8453 if (VT.is512BitVector()) {
8454 SmallVector<int> Mask;
8455 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8456 Mask.push_back(I);
8457 Mask.push_back(I + E + 1);
8458 }
8459 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8460 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8461 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8462 }
8463
8464 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8465}
8466
8467 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8468 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8469 // Initialize outputs to known values.
8470 MVT VT = BV->getSimpleValueType(0);
8471 HOpcode = ISD::DELETED_NODE;
8472 V0 = DAG.getUNDEF(VT);
8473 V1 = DAG.getUNDEF(VT);
8474
8475 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8476 // half of the result is calculated independently from the 128-bit halves of
8477 // the inputs, so that makes the index-checking logic below more complicated.
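  // Illustrative example: v8f32 HADD(A, B) produces
  //   { A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7 },
  // i.e. within each 128-bit half the low 64 bits come from that half of A
  // and the high 64 bits from the same half of B.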
8478 unsigned NumElts = VT.getVectorNumElements();
8479 unsigned GenericOpcode = ISD::DELETED_NODE;
8480 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8481 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8482 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8483 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8484 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8485 // Ignore undef elements.
8486 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8487 if (Op.isUndef())
8488 continue;
8489
8490 // If there's an opcode mismatch, we're done.
8491 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8492 return false;
8493
8494 // Initialize horizontal opcode.
8495 if (HOpcode == ISD::DELETED_NODE) {
8496 GenericOpcode = Op.getOpcode();
8497 switch (GenericOpcode) {
8498 // clang-format off
8499 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8500 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8501 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8502 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8503 default: return false;
8504 // clang-format on
8505 }
8506 }
8507
8508 SDValue Op0 = Op.getOperand(0);
8509 SDValue Op1 = Op.getOperand(1);
8510 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8511 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8512 Op0.getOperand(0) != Op1.getOperand(0) ||
8513 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8514 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8515 return false;
8516
8517 // The source vector is chosen based on which 64-bit half of the
8518 // destination vector is being calculated.
8519 if (j < NumEltsIn64Bits) {
8520 if (V0.isUndef())
8521 V0 = Op0.getOperand(0);
8522 } else {
8523 if (V1.isUndef())
8524 V1 = Op0.getOperand(0);
8525 }
8526
8527 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8528 if (SourceVec != Op0.getOperand(0))
8529 return false;
8530
8531 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8532 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8533 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8534 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8535 (j % NumEltsIn64Bits) * 2;
8536 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8537 continue;
8538
8539 // If this is not a commutative op, this does not match.
8540 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8541 return false;
8542
8543 // Addition is commutative, so try swapping the extract indexes.
8544 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8545 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8546 continue;
8547
8548 // Extract indexes do not match horizontal requirement.
8549 return false;
8550 }
8551 }
8552 // We matched. Opcode and operands are returned by reference as arguments.
8553 return true;
8554}
8555
8556 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8557 const SDLoc &DL, SelectionDAG &DAG,
8558 unsigned HOpcode, SDValue V0, SDValue V1) {
8559 // If either input vector is not the same size as the build vector,
8560 // extract/insert the low bits to the correct size.
8561 // This is free (examples: zmm --> xmm, xmm --> ymm).
8562 MVT VT = BV->getSimpleValueType(0);
8563 unsigned Width = VT.getSizeInBits();
8564 if (V0.getValueSizeInBits() > Width)
8565 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8566 else if (V0.getValueSizeInBits() < Width)
8567 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8568
8569 if (V1.getValueSizeInBits() > Width)
8570 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8571 else if (V1.getValueSizeInBits() < Width)
8572 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8573
8574 unsigned NumElts = VT.getVectorNumElements();
8575 APInt DemandedElts = APInt::getAllOnes(NumElts);
8576 for (unsigned i = 0; i != NumElts; ++i)
8577 if (BV->getOperand(i).isUndef())
8578 DemandedElts.clearBit(i);
8579
8580 // If we don't need the upper xmm, then perform as an xmm hop.
8581 unsigned HalfNumElts = NumElts / 2;
8582 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8583 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8584 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8585 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8586 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8587 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8588 }
8589
8590 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8591}
8592
8593/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8594 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8595 const X86Subtarget &Subtarget,
8596 SelectionDAG &DAG) {
8597 // We need at least 2 non-undef elements to make this worthwhile by default.
8598 unsigned NumNonUndefs =
8599 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8600 if (NumNonUndefs < 2)
8601 return SDValue();
8602
8603 // There are 4 sets of horizontal math operations distinguished by type:
8604 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8605 // subtarget feature. Try to match those "native" patterns first.
8606 MVT VT = BV->getSimpleValueType(0);
8607 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8608 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8609 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8610 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8611 unsigned HOpcode;
8612 SDValue V0, V1;
8613 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8614 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8615 }
8616
8617 // Try harder to match 256-bit ops by using extract/concat.
8618 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8619 return SDValue();
8620
8621 // Count the number of UNDEF operands in the input build_vector.
8622 unsigned NumElts = VT.getVectorNumElements();
8623 unsigned Half = NumElts / 2;
8624 unsigned NumUndefsLO = 0;
8625 unsigned NumUndefsHI = 0;
8626 for (unsigned i = 0, e = Half; i != e; ++i)
8627 if (BV->getOperand(i)->isUndef())
8628 NumUndefsLO++;
8629
8630 for (unsigned i = Half, e = NumElts; i != e; ++i)
8631 if (BV->getOperand(i)->isUndef())
8632 NumUndefsHI++;
8633
8634 SDValue InVec0, InVec1;
8635 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8636 SDValue InVec2, InVec3;
8637 unsigned X86Opcode;
8638 bool CanFold = true;
8639
8640 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8641 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8642 InVec3) &&
8643 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8644 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8645 X86Opcode = X86ISD::HADD;
8646 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8647 InVec1) &&
8648 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8649 InVec3) &&
8650 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8651 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8652 X86Opcode = X86ISD::HSUB;
8653 else
8654 CanFold = false;
8655
8656 if (CanFold) {
8657 // Do not try to expand this build_vector into a pair of horizontal
8658 // add/sub if we can emit a pair of scalar add/sub.
8659 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8660 return SDValue();
8661
8662 // Convert this build_vector into a pair of horizontal binops followed by
8663 // a concat vector. We must adjust the outputs from the partial horizontal
8664 // matching calls above to account for undefined vector halves.
8665 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8666 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8667 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8668 bool isUndefLO = NumUndefsLO == Half;
8669 bool isUndefHI = NumUndefsHI == Half;
8670 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8671 isUndefHI);
8672 }
8673 }
8674
8675 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8676 VT == MVT::v16i16) {
8677 unsigned X86Opcode;
8678 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8679 InVec1))
8680 X86Opcode = X86ISD::HADD;
8681 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8682 InVec1))
8683 X86Opcode = X86ISD::HSUB;
8684 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8685 InVec1))
8686 X86Opcode = X86ISD::FHADD;
8687 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8688 InVec1))
8689 X86Opcode = X86ISD::FHSUB;
8690 else
8691 return SDValue();
8692
8693 // Don't try to expand this build_vector into a pair of horizontal add/sub
8694 // if we can simply emit a pair of scalar add/sub.
8695 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8696 return SDValue();
8697
8698 // Convert this build_vector into two horizontal add/sub followed by
8699 // a concat vector.
8700 bool isUndefLO = NumUndefsLO == Half;
8701 bool isUndefHI = NumUndefsHI == Half;
8702 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8703 isUndefLO, isUndefHI);
8704 }
8705
8706 return SDValue();
8707}
8708
8709static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8710 SelectionDAG &DAG);
8711
8712/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8713/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8714 /// just apply the bit operation to the vectors.
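/// Illustrative example:
///   (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
///     --> (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8))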
8715 /// NOTE: It's not in our interest to start making a general purpose vectorizer
8716/// from this, but enough scalar bit operations are created from the later
8717/// legalization + scalarization stages to need basic support.
8718 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8719 const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG) {
8721 MVT VT = Op->getSimpleValueType(0);
8722 unsigned NumElems = VT.getVectorNumElements();
8723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8724
8725 // Check that all elements have the same opcode.
8726 // TODO: Should we allow UNDEFS and if so how many?
8727 unsigned Opcode = Op->getOperand(0).getOpcode();
8728 for (unsigned i = 1; i < NumElems; ++i)
8729 if (Opcode != Op->getOperand(i).getOpcode())
8730 return SDValue();
8731
8732 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8733 bool IsShift = false;
8734 switch (Opcode) {
8735 default:
8736 return SDValue();
8737 case ISD::SHL:
8738 case ISD::SRL:
8739 case ISD::SRA:
8740 IsShift = true;
8741 break;
8742 case ISD::AND:
8743 case ISD::XOR:
8744 case ISD::OR:
8745 // Don't do this if the buildvector is a splat - we'd replace one
8746 // constant with an entire vector.
8747 if (Op->getSplatValue())
8748 return SDValue();
8749 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8750 return SDValue();
8751 break;
8752 }
8753
8754 SmallVector<SDValue, 4> LHSElts, RHSElts;
8755 for (SDValue Elt : Op->ops()) {
8756 SDValue LHS = Elt.getOperand(0);
8757 SDValue RHS = Elt.getOperand(1);
8758
8759 // We expect the canonicalized RHS operand to be the constant.
8760 if (!isa<ConstantSDNode>(RHS))
8761 return SDValue();
8762
8763 // Extend shift amounts.
8764 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8765 if (!IsShift)
8766 return SDValue();
8767 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8768 }
8769
8770 LHSElts.push_back(LHS);
8771 RHSElts.push_back(RHS);
8772 }
8773
8774 // Limit to shifts by uniform immediates.
8775 // TODO: Only accept vXi8/vXi64 special cases?
8776 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8777 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8778 return SDValue();
8779
8780 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8781 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8782 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8783
8784 if (!IsShift)
8785 return Res;
8786
8787 // Immediately lower the shift to ensure the constant build vector doesn't
8788 // get converted to a constant pool before the shift is lowered.
8789 return LowerShift(Res, Subtarget, DAG);
8790}
8791
8792static bool isShuffleFoldableLoad(SDValue);
8793
8794/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8795/// representing a blend.
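/// Illustrative example: a v4f64 build_vector (A, B, B, A) becomes
///   shuffle(splat(A), splat(B), <0, 5, 6, 3>)
/// which should then lower to a single blend of the two splats.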
8796 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8797 X86Subtarget const &Subtarget,
8798 SelectionDAG &DAG) {
8799 MVT VT = BVOp->getSimpleValueType(0u);
8800
8801 if (VT != MVT::v4f64)
8802 return SDValue();
8803
8804 // Collect unique operands.
8805 auto UniqueOps = SmallSet<SDValue, 16u>();
8806 for (SDValue Op : BVOp->ops()) {
8807 if (isIntOrFPConstant(Op) || Op.isUndef())
8808 return SDValue();
8809 UniqueOps.insert(Op);
8810 }
8811
8812 // Candidate BUILD_VECTOR must have 2 unique operands.
8813 if (UniqueOps.size() != 2u)
8814 return SDValue();
8815
8816 SDValue Op0 = BVOp->getOperand(0u);
8817 UniqueOps.erase(Op0);
8818 SDValue Op1 = *UniqueOps.begin();
8819
8820 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8821 isShuffleFoldableLoad(Op1)) {
8822 // Create shuffle mask.
8823 auto const NumElems = VT.getVectorNumElements();
8824 SmallVector<int, 16u> Mask(NumElems);
8825 for (auto I = 0u; I < NumElems; ++I) {
8826 SDValue Op = BVOp->getOperand(I);
8827 Mask[I] = Op == Op0 ? I : I + NumElems;
8828 }
8829 // Create shuffle of splats.
8830 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8831 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8832 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8833 }
8834
8835 return SDValue();
8836}
8837
8838/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8839/// functionality to do this, so it's all zeros, all ones, or some derivation
8840/// that is cheap to calculate.
8841 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8842 SelectionDAG &DAG,
8843 const X86Subtarget &Subtarget) {
8844 MVT VT = Op.getSimpleValueType();
8845
8846 // Vectors containing all zeros can be matched by pxor and xorps.
8847 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8848 return Op;
8849
8850 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8851 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8852 // vpcmpeqd on 256-bit vectors.
8853 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8854 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8855 return Op;
8856
8857 return getOnesVector(VT, DAG, DL);
8858 }
8859
8860 return SDValue();
8861}
8862
8863/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8864/// from a vector of source values and a vector of extraction indices.
8865/// The vectors might be manipulated to match the type of the permute op.
8866static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8867 const SDLoc &DL, SelectionDAG &DAG,
8868 const X86Subtarget &Subtarget) {
8869 MVT ShuffleVT = VT;
8870 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8871 unsigned NumElts = VT.getVectorNumElements();
8872 unsigned SizeInBits = VT.getSizeInBits();
8873
8874 // Adjust IndicesVec to match VT size.
8875 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8876 "Illegal variable permute mask size");
8877 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8878 // Narrow/widen the indices vector to the correct size.
8879 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8880 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8881 NumElts * VT.getScalarSizeInBits());
8882 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8883 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8884 SDLoc(IndicesVec), SizeInBits);
8885 // Zero-extend the index elements within the vector.
8886 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8887 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8888 IndicesVT, IndicesVec);
8889 }
8890 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8891
8892 // Handle a SrcVec whose size doesn't match VT.
8893 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8894 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8895 // Handle larger SrcVec by treating it as a larger permute.
8896 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8897 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8898 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8899 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8900 Subtarget, DAG, SDLoc(IndicesVec));
8901 SDValue NewSrcVec =
8902 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8903 if (NewSrcVec)
8904 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8905 return SDValue();
8906 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8907 // Widen smaller SrcVec to match VT.
8908 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8909 } else
8910 return SDValue();
8911 }
8912
8913 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8914 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8915 EVT SrcVT = Idx.getValueType();
8916 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8917 uint64_t IndexScale = 0;
8918 uint64_t IndexOffset = 0;
8919
8920 // If we're scaling a smaller permute op, then we need to repeat the
8921 // indices, scaling and offsetting them as well.
8922 // e.g. v4i32 -> v16i8 (Scale = 4)
8923 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8924 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
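    // E.g. an index value of 2 in the v4i32 example becomes the byte indices
    // {8, 9, 10, 11}: 2 * 0x04040404 + 0x03020100 = 0x0B0A0908.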
8925 for (uint64_t i = 0; i != Scale; ++i) {
8926 IndexScale |= Scale << (i * NumDstBits);
8927 IndexOffset |= i << (i * NumDstBits);
8928 }
8929
8930 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8931 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8932 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8933 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8934 return Idx;
8935 };
8936
8937 unsigned Opcode = 0;
8938 switch (VT.SimpleTy) {
8939 default:
8940 break;
8941 case MVT::v16i8:
8942 if (Subtarget.hasSSSE3())
8943 Opcode = X86ISD::PSHUFB;
8944 break;
8945 case MVT::v8i16:
8946 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8947 Opcode = X86ISD::VPERMV;
8948 else if (Subtarget.hasSSSE3()) {
8949 Opcode = X86ISD::PSHUFB;
8950 ShuffleVT = MVT::v16i8;
8951 }
8952 break;
8953 case MVT::v4f32:
8954 case MVT::v4i32:
8955 if (Subtarget.hasAVX()) {
8956 Opcode = X86ISD::VPERMILPV;
8957 ShuffleVT = MVT::v4f32;
8958 } else if (Subtarget.hasSSSE3()) {
8959 Opcode = X86ISD::PSHUFB;
8960 ShuffleVT = MVT::v16i8;
8961 }
8962 break;
8963 case MVT::v2f64:
8964 case MVT::v2i64:
8965 if (Subtarget.hasAVX()) {
8966 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8967 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8968 Opcode = X86ISD::VPERMILPV;
8969 ShuffleVT = MVT::v2f64;
8970 } else if (Subtarget.hasSSE41()) {
8971 // SSE41 can compare v2i64 - select between indices 0 and 1.
8972 return DAG.getSelectCC(
8973 DL, IndicesVec,
8974 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8975 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8976 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8977 ISD::SETEQ);
8978 }
8979 break;
8980 case MVT::v32i8:
8981 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8982 Opcode = X86ISD::VPERMV;
8983 else if (Subtarget.hasXOP()) {
8984 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8985 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8986 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8987 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8988 return DAG.getNode(
8989 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8990 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8991 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8992 } else if (Subtarget.hasAVX()) {
8993 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8994 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8995 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8996 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8997 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8998 ArrayRef<SDValue> Ops) {
8999 // Permute Lo and Hi and then select based on index range.
9000 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9001 // care about bit[7] as it's just an index vector.
9002 SDValue Idx = Ops[2];
9003 EVT VT = Idx.getValueType();
9004 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9005 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9006 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9007 ISD::SETGT);
9008 };
9009 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9010 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9011 PSHUFBBuilder);
9012 }
9013 break;
9014 case MVT::v16i16:
9015 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9016 Opcode = X86ISD::VPERMV;
9017 else if (Subtarget.hasAVX()) {
9018 // Scale to v32i8 and perform as v32i8.
9019 IndicesVec = ScaleIndices(IndicesVec, 2);
9020 return DAG.getBitcast(
9021 VT, createVariablePermute(
9022 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9023 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9024 }
9025 break;
9026 case MVT::v8f32:
9027 case MVT::v8i32:
9028 if (Subtarget.hasAVX2())
9029 Opcode = X86ISD::VPERMV;
9030 else if (Subtarget.hasAVX()) {
9031 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9032 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9033 {0, 1, 2, 3, 0, 1, 2, 3});
9034 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9035 {4, 5, 6, 7, 4, 5, 6, 7});
9036 if (Subtarget.hasXOP())
9037 return DAG.getBitcast(
9038 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9039 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9040 // Permute Lo and Hi and then select based on index range.
9041 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9042 SDValue Res = DAG.getSelectCC(
9043 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9044 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9045 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9046 ISD::SETGT);
9047 return DAG.getBitcast(VT, Res);
9048 }
9049 break;
9050 case MVT::v4i64:
9051 case MVT::v4f64:
9052 if (Subtarget.hasAVX512()) {
9053 if (!Subtarget.hasVLX()) {
9054 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9055 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9056 SDLoc(SrcVec));
9057 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9058 DAG, SDLoc(IndicesVec));
9059 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9060 DAG, Subtarget);
9061 return extract256BitVector(Res, 0, DAG, DL);
9062 }
9063 Opcode = X86ISD::VPERMV;
9064 } else if (Subtarget.hasAVX()) {
9065 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9066 SDValue LoLo =
9067 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9068 SDValue HiHi =
9069 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9070 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9071 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9072 if (Subtarget.hasXOP())
9073 return DAG.getBitcast(
9074 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9075 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9076 // Permute Lo and Hi and then select based on index range.
9077 // This works as VPERMILPD only uses index bit[1] to permute elements.
9078 SDValue Res = DAG.getSelectCC(
9079 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9081 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9082 ISD::SETGT);
9083 return DAG.getBitcast(VT, Res);
9084 }
9085 break;
9086 case MVT::v64i8:
9087 if (Subtarget.hasVBMI())
9088 Opcode = X86ISD::VPERMV;
9089 break;
9090 case MVT::v32i16:
9091 if (Subtarget.hasBWI())
9092 Opcode = X86ISD::VPERMV;
9093 break;
9094 case MVT::v16f32:
9095 case MVT::v16i32:
9096 case MVT::v8f64:
9097 case MVT::v8i64:
9098 if (Subtarget.hasAVX512())
9099 Opcode = X86ISD::VPERMV;
9100 break;
9101 }
9102 if (!Opcode)
9103 return SDValue();
9104
9105 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9106 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9107 "Illegal variable permute shuffle type");
9108
9109 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9110 if (Scale > 1)
9111 IndicesVec = ScaleIndices(IndicesVec, Scale);
9112
9113 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9114 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9115
9116 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9117 SDValue Res = Opcode == X86ISD::VPERMV
9118 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9119 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9120 return DAG.getBitcast(VT, Res);
9121}
9122
9123// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9124// reasoned to be a permutation of a vector by indices in a non-constant vector.
9125// (build_vector (extract_elt V, (extract_elt I, 0)),
9126// (extract_elt V, (extract_elt I, 1)),
9127// ...
9128// ->
9129// (vpermv I, V)
9130//
9131// TODO: Handle undefs
9132// TODO: Utilize pshufb and zero mask blending to support more efficient
9133// construction of vectors with constant-0 elements.
9134static SDValue
9135 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9136 SelectionDAG &DAG,
9137 const X86Subtarget &Subtarget) {
9138 SDValue SrcVec, IndicesVec;
9139
9140 auto PeekThroughFreeze = [](SDValue N) {
9141 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9142 return N->getOperand(0);
9143 return N;
9144 };
9145 // Check for a match of the permute source vector and permute index elements.
9146 // This is done by checking that the i-th build_vector operand is of the form:
9147 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9148 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9149 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9150 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9151 return SDValue();
9152
9153 // If this is the first extract encountered in V, set the source vector,
9154 // otherwise verify the extract is from the previously defined source
9155 // vector.
9156 if (!SrcVec)
9157 SrcVec = Op.getOperand(0);
9158 else if (SrcVec != Op.getOperand(0))
9159 return SDValue();
9160 SDValue ExtractedIndex = Op->getOperand(1);
9161 // Peek through extends.
9162 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9163 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9164 ExtractedIndex = ExtractedIndex.getOperand(0);
9165 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9166 return SDValue();
9167
9168 // If this is the first extract from the index vector candidate, set the
9169 // indices vector, otherwise verify the extract is from the previously
9170 // defined indices vector.
9171 if (!IndicesVec)
9172 IndicesVec = ExtractedIndex.getOperand(0);
9173 else if (IndicesVec != ExtractedIndex.getOperand(0))
9174 return SDValue();
9175
9176 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9177 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9178 return SDValue();
9179 }
9180
9181 MVT VT = V.getSimpleValueType();
9182 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9183}
9184
9185SDValue
9186X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9187 SDLoc dl(Op);
9188
9189 MVT VT = Op.getSimpleValueType();
9190 MVT EltVT = VT.getVectorElementType();
9191 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9192 unsigned NumElems = Op.getNumOperands();
9193
9194 // Generate vectors for predicate vectors.
9195 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9196 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9197
9198 if (VT.getVectorElementType() == MVT::bf16 &&
9199 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9200 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9201
9202 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9203 return VectorCst;
9204
9205 unsigned EVTBits = EltVT.getSizeInBits();
9206 APInt UndefMask = APInt::getZero(NumElems);
9207 APInt FrozenUndefMask = APInt::getZero(NumElems);
9208 APInt ZeroMask = APInt::getZero(NumElems);
9209 APInt NonZeroMask = APInt::getZero(NumElems);
9210 bool IsAllConstants = true;
9211 bool OneUseFrozenUndefs = true;
9212 SmallSet<SDValue, 8> Values;
9213 unsigned NumConstants = NumElems;
9214 for (unsigned i = 0; i < NumElems; ++i) {
9215 SDValue Elt = Op.getOperand(i);
9216 if (Elt.isUndef()) {
9217 UndefMask.setBit(i);
9218 continue;
9219 }
9220 if (ISD::isFreezeUndef(Elt.getNode())) {
9221 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9222 FrozenUndefMask.setBit(i);
9223 continue;
9224 }
9225 Values.insert(Elt);
9226 if (!isIntOrFPConstant(Elt)) {
9227 IsAllConstants = false;
9228 NumConstants--;
9229 }
9230 if (X86::isZeroNode(Elt)) {
9231 ZeroMask.setBit(i);
9232 } else {
9233 NonZeroMask.setBit(i);
9234 }
9235 }
9236
9237 // All undef vector. Return an UNDEF.
9238 if (UndefMask.isAllOnes())
9239 return DAG.getUNDEF(VT);
9240
9241 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9242 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9243 return DAG.getFreeze(DAG.getUNDEF(VT));
9244
9245 // All undef/freeze(undef)/zero vector. Return a zero vector.
9246 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9247 return getZeroVector(VT, Subtarget, DAG, dl);
9248
9249 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9250 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9251 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9252 // and blend the FREEZE-UNDEF operands back in.
9253 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9254 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9255 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9256 SmallVector<int, 16> BlendMask(NumElems, -1);
9257 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9258 for (unsigned i = 0; i < NumElems; ++i) {
9259 if (UndefMask[i]) {
9260 BlendMask[i] = -1;
9261 continue;
9262 }
9263 BlendMask[i] = i;
9264 if (!FrozenUndefMask[i])
9265 Elts[i] = Op.getOperand(i);
9266 else
9267 BlendMask[i] += NumElems;
9268 }
9269 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9270 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9271 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9272 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9273 }
9274
9275 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9276
9277 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9278 // be better off lowering to a smaller build vector and padding with
9279 // undef/zero.
9280 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9281 !isFoldableUseOfShuffle(BV)) {
9282 unsigned UpperElems = NumElems / 2;
9283 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9284 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9285 if (NumUpperUndefsOrZeros >= UpperElems) {
9286 if (VT.is512BitVector() &&
9287 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9288 UpperElems = NumElems - (NumElems / 4);
9289 // If freeze(undef) is in any upper elements, force to zero.
9290 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9291 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9292 SDValue NewBV =
9293 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9294 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9295 }
9296 }
9297
9298 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9299 return AddSub;
9300 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9301 return HorizontalOp;
9302 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9303 return Broadcast;
9304 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9305 return BitOp;
9306 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9307 return Blend;
9308
9309 unsigned NumZero = ZeroMask.popcount();
9310 unsigned NumNonZero = NonZeroMask.popcount();
9311
9312 // If we are inserting one variable into a vector of non-zero constants, try
9313 // to avoid loading each constant element as a scalar. Load the constants as a
9314 // vector and then insert the variable scalar element. If insertion is not
9315 // supported, fall back to a shuffle to get the scalar blended with the
9316 // constants. Insertion into a zero vector is handled as a special-case
9317 // somewhere below here.
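  // Illustrative example: (build_vector 1.0, 2.0, x, 4.0) becomes a
  // constant-pool load of <1.0, 2.0, undef, 4.0> followed by inserting x at
  // index 2.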
9318 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9319 FrozenUndefMask.isZero() &&
9320 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9321 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9322 // Create an all-constant vector. The variable element in the old
9323 // build vector is replaced by undef in the constant vector. Save the
9324 // variable scalar element and its index for use in the insertelement.
9325 LLVMContext &Context = *DAG.getContext();
9326 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9327 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9328 SDValue VarElt;
9329 SDValue InsIndex;
9330 for (unsigned i = 0; i != NumElems; ++i) {
9331 SDValue Elt = Op.getOperand(i);
9332 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9333 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9334 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9335 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9336 else if (!Elt.isUndef()) {
9337 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9338 "Expected one variable element in this vector");
9339 VarElt = Elt;
9340 InsIndex = DAG.getVectorIdxConstant(i, dl);
9341 }
9342 }
9343 Constant *CV = ConstantVector::get(ConstVecOps);
9344 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9345
9346 // The constants we just created may not be legal (e.g., floating point). We
9347 // must lower the vector right here because we cannot guarantee that we'll
9348 // legalize it before loading it. This is also why we could not just create
9349 // a new build vector here. If the build vector contains illegal constants,
9350 // it could get split back up into a series of insert elements.
9351 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9352 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9353 MachineFunction &MF = DAG.getMachineFunction();
9354 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9355 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9356 unsigned InsertC = InsIndex->getAsZExtVal();
9357 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9358 if (InsertC < NumEltsInLow128Bits)
9359 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9360
9361 // There's no good way to insert into the high elements of a >128-bit
9362 // vector, so use shuffles to avoid an extract/insert sequence.
9363 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9364 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9365 SmallVector<int, 8> ShuffleMask;
9366 unsigned NumElts = VT.getVectorNumElements();
9367 for (unsigned i = 0; i != NumElts; ++i)
9368 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9369 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9370 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9371 }
9372
9373 // Special case for single non-zero, non-undef, element.
9374 if (NumNonZero == 1) {
9375 unsigned Idx = NonZeroMask.countr_zero();
9376 SDValue Item = Op.getOperand(Idx);
9377
9378 // If we have a constant or non-constant insertion into the low element of
9379 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9380 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9381 // depending on what the source datatype is.
9382 if (Idx == 0) {
9383 if (NumZero == 0)
9384 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9385
9386 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9387 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9388 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9389 assert((VT.is128BitVector() || VT.is256BitVector() ||
9390 VT.is512BitVector()) &&
9391 "Expected an SSE value type!");
9392 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9393 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9394 // zero vector.
9395 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9396 }
9397
9398 // We can't directly insert an i8 or i16 into a vector, so zero extend
9399 // it to i32 first.
9400 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9401 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9402 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9403 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9404 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9405 return DAG.getBitcast(VT, Item);
9406 }
9407 }
9408
9409 // Is it a vector logical left shift?
9410 if (NumElems == 2 && Idx == 1 &&
9411 X86::isZeroNode(Op.getOperand(0)) &&
9412 !X86::isZeroNode(Op.getOperand(1))) {
9413 unsigned NumBits = VT.getSizeInBits();
9414 return getVShift(true, VT,
9415 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9416 VT, Op.getOperand(1)),
9417 NumBits/2, DAG, *this, dl);
9418 }
9419
9420 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9421 return SDValue();
9422
9423 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9424 // is a non-constant being inserted into an element other than the low one,
9425 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9426 // movd/movss) to move this into the low element, then shuffle it into
9427 // place.
9428 if (EVTBits == 32) {
9429 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9430 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9431 }
9432 }
9433
9434 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9435 if (Values.size() == 1) {
9436 if (EVTBits == 32) {
9437 // Instead of a shuffle like this:
9438 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9439 // Check if it's possible to issue this instead.
9440 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9441 unsigned Idx = NonZeroMask.countr_zero();
9442 SDValue Item = Op.getOperand(Idx);
9443 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9444 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9445 }
9446 return SDValue();
9447 }
9448
9449 // A vector full of immediates; various special cases are already
9450 // handled, so this is best done with a single constant-pool load.
9451 if (IsAllConstants)
9452 return SDValue();
9453
9454 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9455 return V;
9456
9457 // See if we can use a vector load to get all of the elements.
9458 {
9459 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9460 if (SDValue LD =
9461 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9462 return LD;
9463 }
9464
9465 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9466 // build_vector and broadcast it.
9467 // TODO: We could probably generalize this more.
9468 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9469 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9470 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9471 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9472 // Make sure all the even/odd operands match.
9473 for (unsigned i = 2; i != NumElems; ++i)
9474 if (Ops[i % 2] != Op.getOperand(i))
9475 return false;
9476 return true;
9477 };
9478 if (CanSplat(Op, NumElems, Ops)) {
9479 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9480 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9481 // Create a new build vector and cast to v2i64/v2f64.
9482 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9483 DAG.getBuildVector(NarrowVT, dl, Ops));
9484 // Broadcast from v2i64/v2f64 and cast to final VT.
9485 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9486 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9487 NewBV));
9488 }
9489 }
9490
9491 // For AVX-length vectors, build the individual 128-bit pieces and use
9492 // shuffles to put them in place.
9493 if (VT.getSizeInBits() > 128) {
9494 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9495
9496 // Build both the lower and upper subvector.
9497 SDValue Lower =
9498 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9499 SDValue Upper = DAG.getBuildVector(
9500 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9501
9502 // Recreate the wider vector with the lower and upper part.
9503 return concatSubVectors(Lower, Upper, DAG, dl);
9504 }
9505
9506 // Let legalizer expand 2-wide build_vectors.
9507 if (EVTBits == 64) {
9508 if (NumNonZero == 1) {
9509 // One half is zero or undef.
9510 unsigned Idx = NonZeroMask.countr_zero();
9511 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9512 Op.getOperand(Idx));
9513 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9514 }
9515 return SDValue();
9516 }
9517
9518 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9519 if (EVTBits == 8 && NumElems == 16)
9520 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9521 NumZero, DAG, Subtarget))
9522 return V;
9523
9524 if (EltVT == MVT::i16 && NumElems == 8)
9525 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9526 NumZero, DAG, Subtarget))
9527 return V;
9528
9529 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9530 if (EVTBits == 32 && NumElems == 4)
9531 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9532 return V;
9533
9534 // If element VT is == 32 bits, turn it into a number of shuffles.
9535 if (NumElems == 4 && NumZero > 0) {
9536 SmallVector<SDValue, 8> Ops(NumElems);
9537 for (unsigned i = 0; i < 4; ++i) {
9538 bool isZero = !NonZeroMask[i];
9539 if (isZero)
9540 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9541 else
9542 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9543 }
9544
9545 for (unsigned i = 0; i < 2; ++i) {
9546 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9547 default: llvm_unreachable("Unexpected NonZero count");
9548 case 0:
9549 Ops[i] = Ops[i*2]; // Must be a zero vector.
9550 break;
9551 case 1:
9552 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9553 break;
9554 case 2:
9555 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9556 break;
9557 case 3:
9558 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9559 break;
9560 }
9561 }
9562
9563 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9564 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9565 int MaskVec[] = {
9566 Reverse1 ? 1 : 0,
9567 Reverse1 ? 0 : 1,
9568 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9569 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9570 };
9571 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9572 }
9573
9574 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9575
9576 // Check for a build vector from mostly shuffle plus few inserting.
9577 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9578 return Sh;
9579
9580 // For SSE 4.1, use insertps to put the high elements into the low element.
9581 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9582 SDValue Result;
9583 if (!Op.getOperand(0).isUndef())
9584 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9585 else
9586 Result = DAG.getUNDEF(VT);
9587
9588 for (unsigned i = 1; i < NumElems; ++i) {
9589 if (Op.getOperand(i).isUndef()) continue;
9590 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9591 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9592 }
9593 return Result;
9594 }
9595
9596 // Otherwise, expand into a number of unpckl*, start by extending each of
9597 // our (non-undef) elements to the full vector width with the element in the
9598 // bottom slot of the vector (which generates no code for SSE).
9599 SmallVector<SDValue, 8> Ops(NumElems);
9600 for (unsigned i = 0; i < NumElems; ++i) {
9601 if (!Op.getOperand(i).isUndef())
9602 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9603 else
9604 Ops[i] = DAG.getUNDEF(VT);
9605 }
9606
9607 // Next, we iteratively mix elements, e.g. for v4f32:
9608 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9609 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9610 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9611 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9612 // Generate scaled UNPCKL shuffle mask.
9613 SmallVector<int, 16> Mask;
9614 for(unsigned i = 0; i != Scale; ++i)
9615 Mask.push_back(i);
9616 for (unsigned i = 0; i != Scale; ++i)
9617 Mask.push_back(NumElems+i);
9618 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9619
9620 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9621 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9622 }
9623 return Ops[0];
9624}
9625
9626// 256-bit AVX can use the vinsertf128 instruction
9627// to create 256-bit vectors from two other 128-bit ones.
9628// TODO: Detect subvector broadcast here instead of DAG combine?
9629 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9630 SelectionDAG &DAG,
9631 const X86Subtarget &Subtarget) {
9632 MVT ResVT = Op.getSimpleValueType();
9633 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9634 "Value type must be 256-/512-bit wide");
9635
9636 unsigned NumOperands = Op.getNumOperands();
9637 unsigned NumFreezeUndef = 0;
9638 unsigned NumZero = 0;
9639 unsigned NumNonZero = 0;
9640 unsigned NonZeros = 0;
9641 SmallSet<SDValue, 4> Undefs;
9642 for (unsigned i = 0; i != NumOperands; ++i) {
9643 SDValue SubVec = Op.getOperand(i);
9644 if (SubVec.isUndef())
9645 continue;
9646 if (ISD::isFreezeUndef(SubVec.getNode())) {
9647 // If the freeze(undef) has multiple uses then we must fold to zero.
9648 if (SubVec.hasOneUse()) {
9649 ++NumFreezeUndef;
9650 } else {
9651 ++NumZero;
9652 Undefs.insert(SubVec);
9653 }
9654 }
9655 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9656 ++NumZero;
9657 else {
9658 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9659 NonZeros |= 1 << i;
9660 ++NumNonZero;
9661 }
9662 }
9663
9664 // If we have more than 2 non-zeros, build each half separately.
9665 if (NumNonZero > 2) {
9666 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9667 ArrayRef<SDUse> Ops = Op->ops();
9668 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9669 Ops.slice(0, NumOperands/2));
9670 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9671 Ops.slice(NumOperands/2));
9672 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9673 }
9674
9675 // Otherwise, build it up through insert_subvectors.
9676 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9677 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9678 : DAG.getUNDEF(ResVT));
9679
9680 // Replace Undef operands with ZeroVector.
9681 for (SDValue U : Undefs)
9682 DAG.ReplaceAllUsesOfValueWith(
9683 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9684
9685 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9686 unsigned NumSubElems = SubVT.getVectorNumElements();
9687 for (unsigned i = 0; i != NumOperands; ++i) {
9688 if ((NonZeros & (1 << i)) == 0)
9689 continue;
9690
9691 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9692 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9693 }
9694
9695 return Vec;
9696}
9697
9698// Returns true if the given node is a type promotion (by concatenating i1
9699// zeros) of the result of a node that already zeros all upper bits of
9700// k-register.
9701// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9702 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9703 const X86Subtarget &Subtarget,
9704 SelectionDAG & DAG) {
9705 MVT ResVT = Op.getSimpleValueType();
9706 unsigned NumOperands = Op.getNumOperands();
9707 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9708 "Unexpected number of operands in CONCAT_VECTORS");
9709
9710 uint64_t Zeros = 0;
9711 uint64_t NonZeros = 0;
9712 for (unsigned i = 0; i != NumOperands; ++i) {
9713 SDValue SubVec = Op.getOperand(i);
9714 if (SubVec.isUndef())
9715 continue;
9716 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9717 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9718 Zeros |= (uint64_t)1 << i;
9719 else
9720 NonZeros |= (uint64_t)1 << i;
9721 }
9722
9723 unsigned NumElems = ResVT.getVectorNumElements();
9724
9725 // If we are inserting a non-zero vector and there are zeros in the LSBs and undef
9726 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9727 // insert_subvector will give us two kshifts.
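  // Illustrative example: concat_vectors(v8i1 zero, X, undef, undef) widens X,
  // performs one KSHIFTL by 8 and extracts the v32i1 result, instead of the
  // two kshifts that generic insert_subvector lowering would produce.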
9728 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9729 Log2_64(NonZeros) != NumOperands - 1) {
9730 unsigned Idx = Log2_64(NonZeros);
9731 SDValue SubVec = Op.getOperand(Idx);
9732 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9733 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9734 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9735 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9736 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9737 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9738 DAG.getVectorIdxConstant(0, dl));
9739 }
9740
9741 // If there are zero or one non-zeros we can handle this very simply.
9742 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9743 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9744 if (!NonZeros)
9745 return Vec;
9746 unsigned Idx = Log2_64(NonZeros);
9747 SDValue SubVec = Op.getOperand(Idx);
9748 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9749 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9750 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9751 }
9752
9753 if (NumOperands > 2) {
9754 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9755 ArrayRef<SDUse> Ops = Op->ops();
9756 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9757 Ops.slice(0, NumOperands / 2));
9758 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9759 Ops.slice(NumOperands / 2));
9760 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9761 }
9762
9763 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9764
9765 if (ResVT.getVectorNumElements() >= 16)
9766 return Op; // The operation is legal with KUNPCK
9767
9768 SDValue Vec =
9769 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9770 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9771 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9772 DAG.getVectorIdxConstant(NumElems / 2, dl));
9773}
9774
9775 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9776 const X86Subtarget &Subtarget,
9777 SelectionDAG &DAG) {
9778 SDLoc DL(Op);
9779 MVT VT = Op.getSimpleValueType();
9780 if (VT.getVectorElementType() == MVT::i1)
9781 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9782
9783 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9784 // from two other 128-bit ones.
9785 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9786 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9787 (VT.is512BitVector() &&
9788 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9789 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9790}
9791
9792//===----------------------------------------------------------------------===//
9793// Vector shuffle lowering
9794//
9795// This is an experimental code path for lowering vector shuffles on x86. It is
9796// designed to handle arbitrary vector shuffles and blends, gracefully
9797// degrading performance as necessary. It works hard to recognize idiomatic
9798// shuffles and lower them to optimal instruction patterns without leaving
9799// a framework that allows reasonably efficient handling of all vector shuffle
9800// patterns.
9801//===----------------------------------------------------------------------===//
9802
9803/// Checks whether the vector elements referenced by two shuffle masks are
9804/// equivalent.
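/// Illustrative example: if Op and ExpectedOp are the same splat BUILD_VECTOR,
/// any two element indices are equivalent even when Idx != ExpectedIdx,
/// because every element holds the same scalar value.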
9805static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9806 int Idx, int ExpectedIdx) {
9807 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9808 ExpectedIdx < MaskSize && "Out of range element index");
9809 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9810 return false;
9811
9812 EVT VT = Op.getValueType();
9813 EVT ExpectedVT = ExpectedOp.getValueType();
9814
9815 // Sources must be vectors and match the mask's element count.
9816 if (!VT.isVector() || !ExpectedVT.isVector() ||
9817 (int)VT.getVectorNumElements() != MaskSize ||
9818 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9819 return false;
9820
9821 // Exact match.
9822 if (Idx == ExpectedIdx && Op == ExpectedOp)
9823 return true;
9824
9825 switch (Op.getOpcode()) {
9826 case ISD::BUILD_VECTOR:
9827 // If the values are build vectors, we can look through them to find
9828 // equivalent inputs that make the shuffles equivalent.
9829 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9830 case ISD::BITCAST: {
9831 SDValue Src = Op.getOperand(0);
9832 EVT SrcVT = Src.getValueType();
9833 if (Op == ExpectedOp && SrcVT.isVector()) {
9834 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9835 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9836 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9837 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9838 Idx / Scale, ExpectedIdx / Scale);
9839 }
9840 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9841 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9842 for (unsigned I = 0; I != Scale; ++I)
9843 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9844 (Idx * Scale) + I,
9845 (ExpectedIdx * Scale) + I))
9846 return false;
9847 return true;
9848 }
9849 }
9850 break;
9851 }
9852 case ISD::VECTOR_SHUFFLE: {
9853 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9854 return Op == ExpectedOp &&
9855 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9856 }
9857 case X86ISD::VBROADCAST:
9858 case X86ISD::VBROADCAST_LOAD:
9859 return Op == ExpectedOp;
9860 case X86ISD::SUBV_BROADCAST_LOAD:
9861 if (Op == ExpectedOp) {
9862 auto *MemOp = cast<MemSDNode>(Op);
9863 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9864 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9865 }
9866 break;
9867 case X86ISD::VPERMI: {
9868 if (Op == ExpectedOp) {
9869 SmallVector<int, 8> Mask;
9870 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9871 SDValue Src = Op.getOperand(0);
9872 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9873 Mask[ExpectedIdx]);
9874 }
9875 break;
9876 }
9877 case X86ISD::HADD:
9878 case X86ISD::HSUB:
9879 case X86ISD::FHADD:
9880 case X86ISD::FHSUB:
9881 case X86ISD::PACKSS:
9882 case X86ISD::PACKUS:
9883 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9884 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9885 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9886 int NumElts = VT.getVectorNumElements();
9887 int NumLanes = VT.getSizeInBits() / 128;
9888 int NumEltsPerLane = NumElts / NumLanes;
9889 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9890 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9891 bool SameElt =
9892 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9893 return SameLane && SameElt;
9894 }
9895 break;
9896 }
9897
9898 return false;
9899}
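// Illustrative walkthrough (not part of the original source): suppose Op and
// ExpectedOp are both (v4i32 bitcast (v2i64 build_vector X, X)) and
// MaskSize == 4. Querying Idx == 0 against ExpectedIdx == 2 takes the
// wide-source bitcast path with Scale == 64 / 32 == 2: (0 % 2) == (2 % 2)
// holds, and the recursive call compares i64 elements 0 and 1 of the
// build_vector, which are both X, so the two i32 elements are reported as
// equivalent.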
9900
9901/// Tiny helper function to identify a no-op mask.
9902///
9903/// This is a somewhat boring predicate function. It checks whether the mask
9904/// array input, which is assumed to be a single-input shuffle mask of the kind
9905/// used by the X86 shuffle instructions (not a fully general
9906/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9907/// in-place shuffle are 'no-op's.
9908static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9909 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9910 assert(Mask[i] >= -1 && "Out of bound mask element!");
9911 if (Mask[i] >= 0 && Mask[i] != i)
9912 return false;
9913 }
9914 return true;
9915}
9916
9917/// Test whether there are elements crossing LaneSizeInBits lanes in this
9918/// shuffle mask.
9919///
9920/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9921/// and we routinely test for these.
9922static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9923 unsigned ScalarSizeInBits,
9924 ArrayRef<int> Mask) {
9925 assert(LaneSizeInBits && ScalarSizeInBits &&
9926 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9927 "Illegal shuffle lane size");
9928 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9929 int Size = Mask.size();
9930 for (int i = 0; i < Size; ++i)
9931 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9932 return true;
9933 return false;
9934}
9935
9936/// Test whether there are elements crossing 128-bit lanes in this
9937/// shuffle mask.
9938static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9939 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9940}
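// Illustrative example (not from the original source): for v8f32 a 128-bit
// lane holds 4 elements, so the mask <0,1,2,3,7,6,5,4> stays within its
// lanes, while <0,1,2,4,...> is lane-crossing because index 3 reads mask
// value 4, which lives in the upper lane.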
9941
9942/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9943/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9944/// better support 'repeated mask + lane permute' style shuffles.
9945static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9946 unsigned ScalarSizeInBits,
9947 ArrayRef<int> Mask) {
9948 assert(LaneSizeInBits && ScalarSizeInBits &&
9949 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9950 "Illegal shuffle lane size");
9951 int NumElts = Mask.size();
9952 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9953 int NumLanes = NumElts / NumEltsPerLane;
9954 if (NumLanes > 1) {
9955 for (int i = 0; i != NumLanes; ++i) {
9956 int SrcLane = -1;
9957 for (int j = 0; j != NumEltsPerLane; ++j) {
9958 int M = Mask[(i * NumEltsPerLane) + j];
9959 if (M < 0)
9960 continue;
9961 int Lane = (M % NumElts) / NumEltsPerLane;
9962 if (SrcLane >= 0 && SrcLane != Lane)
9963 return true;
9964 SrcLane = Lane;
9965 }
9966 }
9967 }
9968 return false;
9969}
9970
9971/// Test whether a shuffle mask is equivalent within each sub-lane.
9972///
9973/// This checks a shuffle mask to see if it is performing the same
9974/// lane-relative shuffle in each sub-lane. This trivially implies
9975/// that it is also not lane-crossing. It may however involve a blend from the
9976/// same lane of a second vector.
9977///
9978/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9979/// non-trivial to compute in the face of undef lanes. The representation is
9980/// suitable for use with existing 128-bit shuffles as entries from the second
9981/// vector have been remapped to [LaneSize, 2*LaneSize).
9982static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9983 ArrayRef<int> Mask,
9984 SmallVectorImpl<int> &RepeatedMask) {
9985 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9986 RepeatedMask.assign(LaneSize, -1);
9987 int Size = Mask.size();
9988 for (int i = 0; i < Size; ++i) {
9989 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9990 if (Mask[i] < 0)
9991 continue;
9992 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9993 // This entry crosses lanes, so there is no way to model this shuffle.
9994 return false;
9995
9996 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9997 // Adjust second vector indices to start at LaneSize instead of Size.
9998 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9999 : Mask[i] % LaneSize + LaneSize;
10000 if (RepeatedMask[i % LaneSize] < 0)
10001 // This is the first non-undef entry in this slot of a 128-bit lane.
10002 RepeatedMask[i % LaneSize] = LocalM;
10003 else if (RepeatedMask[i % LaneSize] != LocalM)
10004 // Found a mismatch with the repeated mask.
10005 return false;
10006 }
10007 return true;
10008}
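// Worked example (illustrative, not in the original source): for v8f32 with
// 128-bit lanes (LaneSize == 4), the mask <0,9,2,11,4,13,6,15> repeats the
// same lane-relative pattern in both lanes and yields
// RepeatedMask == <0,5,2,7>, where 5 and 7 denote elements 1 and 3 of the
// second vector remapped into the [4,8) range.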
10009
10010/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10011static bool
10012is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10013 SmallVectorImpl<int> &RepeatedMask) {
10014 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10015}
10016
10017static bool
10018is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10019 SmallVector<int, 32> RepeatedMask;
10020 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10021}
10022
10023/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10024static bool
10025is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10026 SmallVectorImpl<int> &RepeatedMask) {
10027 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10028}
10029
10030/// Test whether a target shuffle mask is equivalent within each sub-lane.
10031/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10032static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10033 unsigned EltSizeInBits,
10034 ArrayRef<int> Mask,
10035 SmallVectorImpl<int> &RepeatedMask) {
10036 int LaneSize = LaneSizeInBits / EltSizeInBits;
10037 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10038 int Size = Mask.size();
10039 for (int i = 0; i < Size; ++i) {
10040 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10041 if (Mask[i] == SM_SentinelUndef)
10042 continue;
10043 if (Mask[i] == SM_SentinelZero) {
10044 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10045 return false;
10046 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10047 continue;
10048 }
10049 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10050 // This entry crosses lanes, so there is no way to model this shuffle.
10051 return false;
10052
10053 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10054 // later vector indices to start at multiples of LaneSize instead of Size.
10055 int LaneM = Mask[i] / Size;
10056 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10057 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10058 // This is the first non-undef entry in this slot of a 128-bit lane.
10059 RepeatedMask[i % LaneSize] = LocalM;
10060 else if (RepeatedMask[i % LaneSize] != LocalM)
10061 // Found a mismatch with the repeated mask.
10062 return false;
10063 }
10064 return true;
10065}
10066
10067/// Test whether a target shuffle mask is equivalent within each sub-lane.
10068/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10069static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10070 ArrayRef<int> Mask,
10071 SmallVectorImpl<int> &RepeatedMask) {
10072 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10073 Mask, RepeatedMask);
10074}
10075
10076/// Checks whether a shuffle mask is equivalent to an explicit list of
10077/// arguments.
10078///
10079/// This is a fast way to test a shuffle mask against a fixed pattern:
10080///
10081/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10082///
10083/// It returns true if the mask is exactly as wide as the argument list, and
10084/// each element of the mask is either -1 (signifying undef) or the value given
10085/// in the argument.
10086static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10087 SDValue V1 = SDValue(),
10088 SDValue V2 = SDValue()) {
10089 int Size = Mask.size();
10090 if (Size != (int)ExpectedMask.size())
10091 return false;
10092
10093 for (int i = 0; i < Size; ++i) {
10094 assert(Mask[i] >= -1 && "Out of bound mask element!");
10095 int MaskIdx = Mask[i];
10096 int ExpectedIdx = ExpectedMask[i];
10097 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10098 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10099 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10100 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10101 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10102 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10103 return false;
10104 }
10105 }
10106 return true;
10107}
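// Illustrative usage (not in the original source): Mask <0,-1,2,3> is
// considered equivalent to ExpectedMask <0,1,2,3> because the undef entry is
// ignored, whereas Mask <1,1,2,3> is only accepted if IsElementEquivalent can
// prove that elements 0 and 1 of V1 hold the same value (e.g. a splatted
// build_vector).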
10108
10109/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10110///
10111/// The masks must be exactly the same width.
10112///
10113/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10114/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10115///
10116/// SM_SentinelZero is accepted as a valid negative index but must match in
10117/// both, or via a known bits test.
10118static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10119 ArrayRef<int> ExpectedMask,
10120 const SelectionDAG &DAG,
10121 SDValue V1 = SDValue(),
10122 SDValue V2 = SDValue()) {
10123 int Size = Mask.size();
10124 if (Size != (int)ExpectedMask.size())
10125 return false;
10126 assert(llvm::all_of(ExpectedMask,
10127 [Size](int M) {
10128 return M == SM_SentinelZero ||
10129 isInRange(M, 0, 2 * Size);
10130 }) &&
10131 "Illegal target shuffle mask");
10132
10133 // Check for out-of-range target shuffle mask indices.
10134 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10135 return false;
10136
10137 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10138 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10139 !V1.getValueType().isVector()))
10140 V1 = SDValue();
10141 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10142 !V2.getValueType().isVector()))
10143 V2 = SDValue();
10144
10145 APInt ZeroV1 = APInt::getZero(Size);
10146 APInt ZeroV2 = APInt::getZero(Size);
10147
10148 for (int i = 0; i < Size; ++i) {
10149 int MaskIdx = Mask[i];
10150 int ExpectedIdx = ExpectedMask[i];
10151 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10152 continue;
10153 // If we failed to match an expected SM_SentinelZero then early out.
10154 if (ExpectedIdx < 0)
10155 return false;
10156 if (MaskIdx == SM_SentinelZero) {
10157 // If we need this expected index to be a zero element, then update the
10158 // relevant zero mask and perform the known bits at the end to minimize
10159 // repeated computes.
10160 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10161 if (ExpectedV &&
10162 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10163 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10164 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10165 ZeroMask.setBit(BitIdx);
10166 continue;
10167 }
10168 }
10169 if (MaskIdx >= 0) {
10170 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10171 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10172 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10173 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10174 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10175 continue;
10176 }
10177 return false;
10178 }
10179 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10180 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10181}
10182
10183// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10184// instructions.
10185static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10186 const SelectionDAG &DAG) {
10187 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10188 return false;
10189
10190 SmallVector<int, 8> Unpcklwd;
10191 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10192 /* Unary = */ false);
10193 SmallVector<int, 8> Unpckhwd;
10194 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10195 /* Unary = */ false);
10196 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10197 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10198 return IsUnpackwdMask;
10199}
10200
10201static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10202 const SelectionDAG &DAG) {
10203 // Create 128-bit vector type based on mask size.
10204 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10205 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10206
10207 // We can't assume a canonical shuffle mask, so try the commuted version too.
10208 SmallVector<int, 4> CommutedMask(Mask);
10209 ShuffleVectorSDNode::commuteMask(CommutedMask);
10210
10211 // Match any of unary/binary or low/high.
10212 for (unsigned i = 0; i != 4; ++i) {
10213 SmallVector<int, 16> UnpackMask;
10214 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10215 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10216 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10217 return true;
10218 }
10219 return false;
10220}
10221
10222/// Return true if a shuffle mask chooses elements identically in its top and
10223/// bottom halves. For example, any splat mask has the same top and bottom
10224/// halves. If an element is undefined in only one half of the mask, the halves
10225/// are not considered identical.
10226static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10227 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10228 unsigned HalfSize = Mask.size() / 2;
10229 for (unsigned i = 0; i != HalfSize; ++i) {
10230 if (Mask[i] != Mask[i + HalfSize])
10231 return false;
10232 }
10233 return true;
10234}
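// Illustrative example (not in the original source): <0,5,2,7,0,5,2,7> has
// identical halves, but <0,5,2,7,0,5,2,-1> does not, since the trailing undef
// appears in only one half.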
10235
10236/// Get a 4-lane 8-bit shuffle immediate for a mask.
10237///
10238/// This helper function produces an 8-bit shuffle immediate corresponding to
10239/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10240/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10241/// example.
10242///
10243/// NB: We rely heavily on "undef" masks preserving the input lane.
10244static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10245 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10246 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10247 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10248 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10249 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10250
10251 // If the mask only uses one non-undef element, then fully 'splat' it to
10252 // improve later broadcast matching.
10253 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10254 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10255
10256 int FirstElt = Mask[FirstIndex];
10257 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10258 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10259
10260 unsigned Imm = 0;
10261 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10262 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10263 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10264 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10265 return Imm;
10266}
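// Worked example (illustrative, not in the original source): Mask <3,1,2,0>
// encodes as 0b00'10'01'11 == 0x27 (element 0 in bits [1:0], element 3 in
// bits [7:6]), and the single-element mask <-1,2,-1,-1> is splatted to
// 0b10'10'10'10 == 0xAA to help later broadcast matching.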
10267
10268static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10269 SelectionDAG &DAG) {
10270 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10271}
10272
10273// Canonicalize SHUFPD mask to improve chances of further folding.
10274// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10275static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10276 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10277 "Unexpected SHUFPD mask size");
10278 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10279 "Unexpected SHUFPD mask elements");
10280
10281 // If the mask only uses one non-undef element, then fully 'splat' it to
10282 // improve later broadcast matching.
10283 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10284 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10285 "All undef shuffle mask");
10286
10287 int FirstElt = Mask[FirstIndex];
10288 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10289 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10290 unsigned Imm = 0;
10291 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10292 Imm |= FirstElt << I;
10293 return Imm;
10294 }
10295
10296 // Attempt to keep any undef elements in place to improve chances of the
10297 // shuffle becoming a (commutative) blend.
10298 unsigned Imm = 0;
10299 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10300 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10301
10302 return Imm;
10303}
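// Worked example (illustrative, not in the original source): for a v4f64
// SHUFPD mask <1,-1,0,1> the general path keeps the undef slot in place and
// produces 0b1011 (bit I holds Mask[I], with undefs defaulting to I & 1),
// while the near-splat mask <-1,1,-1,1> is fully splatted to 0b1111.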
10304
10305static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10306 SelectionDAG &DAG) {
10307 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10308}
10309
10310// The shuffle result is as follows:
10311// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
10312// Each Zeroable element corresponds to a particular Mask element,
10313// as described in the computeZeroableShuffleElements function.
10314//
10315// The function looks for a sub-mask whose nonzero elements are in
10316// increasing order. If such a sub-mask exists, the function returns true.
10317static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10318 ArrayRef<int> Mask, const EVT &VectorType,
10319 bool &IsZeroSideLeft) {
10320 int NextElement = -1;
10321 // Check if the Mask's nonzero elements are in increasing order.
10322 for (int i = 0, e = Mask.size(); i < e; i++) {
10323 // Check that the mask's zero elements are built from only zeros.
10324 assert(Mask[i] >= -1 && "Out of bound mask element!");
10325 if (Mask[i] < 0)
10326 return false;
10327 if (Zeroable[i])
10328 continue;
10329 // Find the lowest non zero element
10330 if (NextElement < 0) {
10331 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10332 IsZeroSideLeft = NextElement != 0;
10333 }
10334 // Exit if the mask's non zero elements are not in increasing order.
10335 if (NextElement != Mask[i])
10336 return false;
10337 NextElement++;
10338 }
10339 return true;
10340}
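// Illustrative example (not in the original source): with VT == v4i32,
// Mask == <0,4,5,6> and only element 0 zeroable, the zeroable slot is skipped
// and the remaining mask values 4,5,6 are consecutive, so the function
// returns true with IsZeroSideLeft == true (the zeros sit on the left and the
// ordered elements come from the second source).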
10341
10342static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10343 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10344 const X86Subtarget &Subtarget,
10345 unsigned Depth = 0);
10346
10347/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10348static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10349 ArrayRef<int> Mask, SDValue V1,
10350 SDValue V2, const APInt &Zeroable,
10351 const X86Subtarget &Subtarget,
10352 SelectionDAG &DAG) {
10353 int Size = Mask.size();
10354 int LaneSize = 128 / VT.getScalarSizeInBits();
10355 const int NumBytes = VT.getSizeInBits() / 8;
10356 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10357
10358 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10359 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10360 (Subtarget.hasBWI() && VT.is512BitVector()));
10361
10362 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10363 // Sign bit set in i8 mask means zero element.
10364 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10365
10366 SDValue V;
10367 for (int i = 0; i < NumBytes; ++i) {
10368 int M = Mask[i / NumEltBytes];
10369 if (M < 0) {
10370 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10371 continue;
10372 }
10373 if (Zeroable[i / NumEltBytes]) {
10374 PSHUFBMask[i] = ZeroMask;
10375 continue;
10376 }
10377
10378 // We can only use a single input of V1 or V2.
10379 SDValue SrcV = (M >= Size ? V2 : V1);
10380 if (V && V != SrcV)
10381 return SDValue();
10382 V = SrcV;
10383 M %= Size;
10384
10385 // PSHUFB can't cross lanes, ensure this doesn't happen.
10386 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10387 return SDValue();
10388
10389 M = M % LaneSize;
10390 M = M * NumEltBytes + (i % NumEltBytes);
10391 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10392 }
10393 assert(V && "Failed to find a source input");
10394
10395 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10396 return DAG.getBitcast(
10397 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10398 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10399}
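// Illustrative note (not in the original source): the control vector is built
// per destination byte, e.g. for a v8i16 shuffle where Mask[0] == 2 the first
// two PSHUFB control bytes become 4 and 5 (the two bytes of lane-local
// element 2), and any zeroable element contributes 0x80 bytes to force zeros.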
10400
10401static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10402 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10403 const SDLoc &dl);
10404
10405// X86 has dedicated shuffle that can be lowered to VEXPAND
10406static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10407 SDValue V2, ArrayRef<int> Mask,
10408 const APInt &Zeroable,
10409 const X86Subtarget &Subtarget,
10410 SelectionDAG &DAG) {
10411 bool IsLeftZeroSide = true;
10412 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10413 IsLeftZeroSide))
10414 return SDValue();
10415 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10416 MVT IntegerType =
10417 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10418 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10419 unsigned NumElts = VT.getVectorNumElements();
10420 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10421 "Unexpected number of vector elements");
10422 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10423 Subtarget, DAG, DL);
10424 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10425 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10426 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10427}
10428
10429static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10430 unsigned &UnpackOpcode, bool IsUnary,
10431 ArrayRef<int> TargetMask, const SDLoc &DL,
10432 SelectionDAG &DAG,
10433 const X86Subtarget &Subtarget) {
10434 int NumElts = VT.getVectorNumElements();
10435
10436 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10437 for (int i = 0; i != NumElts; i += 2) {
10438 int M1 = TargetMask[i + 0];
10439 int M2 = TargetMask[i + 1];
10440 Undef1 &= (SM_SentinelUndef == M1);
10441 Undef2 &= (SM_SentinelUndef == M2);
10442 Zero1 &= isUndefOrZero(M1);
10443 Zero2 &= isUndefOrZero(M2);
10444 }
10445 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10446 "Zeroable shuffle detected");
10447
10448 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10449 SmallVector<int, 64> Unpckl, Unpckh;
10450 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10451 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10452 (IsUnary ? V1 : V2))) {
10453 UnpackOpcode = X86ISD::UNPCKL;
10454 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10455 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10456 return true;
10457 }
10458
10459 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10460 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10461 (IsUnary ? V1 : V2))) {
10462 UnpackOpcode = X86ISD::UNPCKH;
10463 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10464 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10465 return true;
10466 }
10467
10468 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10469 if (IsUnary && (Zero1 || Zero2)) {
10470 // Don't bother if we can blend instead.
10471 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10472 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10473 return false;
10474
10475 bool MatchLo = true, MatchHi = true;
10476 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10477 int M = TargetMask[i];
10478
10479 // Ignore if the input is known to be zero or the index is undef.
10480 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10481 (M == SM_SentinelUndef))
10482 continue;
10483
10484 MatchLo &= (M == Unpckl[i]);
10485 MatchHi &= (M == Unpckh[i]);
10486 }
10487
10488 if (MatchLo || MatchHi) {
10489 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10490 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10491 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10492 return true;
10493 }
10494 }
10495
10496 // If a binary shuffle, commute and try again.
10497 if (!IsUnary) {
10498 ShuffleVectorSDNode::commuteMask(Unpckl);
10499 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10500 UnpackOpcode = X86ISD::UNPCKL;
10501 std::swap(V1, V2);
10502 return true;
10503 }
10504
10505 ShuffleVectorSDNode::commuteMask(Unpckh);
10506 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10507 UnpackOpcode = X86ISD::UNPCKH;
10508 std::swap(V1, V2);
10509 return true;
10510 }
10511 }
10512
10513 return false;
10514}
10515
10516// X86 has dedicated unpack instructions that can handle specific blend
10517// operations: UNPCKH and UNPCKL.
10518static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10519 SDValue V2, ArrayRef<int> Mask,
10520 SelectionDAG &DAG) {
10521 SmallVector<int, 8> Unpckl;
10522 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10523 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10524 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10525
10526 SmallVector<int, 8> Unpckh;
10527 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10528 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10529 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10530
10531 // Commute and try again.
10532 ShuffleVectorSDNode::commuteMask(Unpckl);
10533 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10534 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10535
10536 ShuffleVectorSDNode::commuteMask(Unpckh);
10537 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10539
10540 return SDValue();
10541}
10542
10543/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10544/// followed by unpack 256-bit.
10545static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10546 SDValue V2, ArrayRef<int> Mask,
10547 SelectionDAG &DAG) {
10548 SmallVector<int, 32> Unpckl, Unpckh;
10549 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10550 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10551
10552 unsigned UnpackOpcode;
10553 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10554 UnpackOpcode = X86ISD::UNPCKL;
10555 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10556 UnpackOpcode = X86ISD::UNPCKH;
10557 else
10558 return SDValue();
10559
10560 // This is a "natural" unpack operation (rather than the 128-bit sectored
10561 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10562 // input in order to use the x86 instruction.
10563 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10564 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10565 V1 = DAG.getBitcast(VT, V1);
10566 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10567}
10568
10569// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10570// source into the lower elements and zeroing the upper elements.
10571static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10572 ArrayRef<int> Mask, const APInt &Zeroable,
10573 const X86Subtarget &Subtarget) {
10574 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10575 return false;
10576
10577 unsigned NumElts = Mask.size();
10578 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10579 unsigned MaxScale = 64 / EltSizeInBits;
10580
10581 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10582 unsigned SrcEltBits = EltSizeInBits * Scale;
10583 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10584 continue;
10585 unsigned NumSrcElts = NumElts / Scale;
10586 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10587 continue;
10588 unsigned UpperElts = NumElts - NumSrcElts;
10589 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10590 continue;
10591 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10592 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10593 DstVT = MVT::getIntegerVT(EltSizeInBits);
10594 if ((NumSrcElts * EltSizeInBits) >= 128) {
10595 // ISD::TRUNCATE
10596 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10597 } else {
10598 // X86ISD::VTRUNC
10599 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10600 }
10601 return true;
10602 }
10603
10604 return false;
10605}
10606
10607// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10608// element padding to the final DstVT.
10609static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10610 const X86Subtarget &Subtarget,
10611 SelectionDAG &DAG, bool ZeroUppers) {
10612 MVT SrcVT = Src.getSimpleValueType();
10613 MVT DstSVT = DstVT.getScalarType();
10614 unsigned NumDstElts = DstVT.getVectorNumElements();
10615 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10616 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10617
10618 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10619 return SDValue();
10620
10621 // Perform a direct ISD::TRUNCATE if possible.
10622 if (NumSrcElts == NumDstElts)
10623 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10624
10625 if (NumSrcElts > NumDstElts) {
10626 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10627 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10628 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10629 }
10630
10631 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10632 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10633 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10634 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10635 DstVT.getSizeInBits());
10636 }
10637
10638 // Non-VLX targets must truncate from a 512-bit type, so we need to
10639 // widen, truncate and then possibly extract the original subvector.
10640 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10641 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10642 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10643 }
10644
10645 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10647 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10648 if (DstVT != TruncVT)
10649 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10650 DstVT.getSizeInBits());
10651 return Trunc;
10652}
10653
10654// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10655//
10656// An example is the following:
10657//
10658// t0: ch = EntryToken
10659// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10660// t25: v4i32 = truncate t2
10661// t41: v8i16 = bitcast t25
10662// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10663// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10664// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10665// t18: v2i64 = bitcast t51
10666//
10667// One can just use a single vpmovdw instruction; without avx512vl we need to
10668// use the zmm variant and extract the lower subvector, padding with zeroes.
10669// TODO: Merge with lowerShuffleAsVTRUNC.
10670static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10671 SDValue V2, ArrayRef<int> Mask,
10672 const APInt &Zeroable,
10673 const X86Subtarget &Subtarget,
10674 SelectionDAG &DAG) {
10675 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10676 if (!Subtarget.hasAVX512())
10677 return SDValue();
10678
10679 unsigned NumElts = VT.getVectorNumElements();
10680 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10681 unsigned MaxScale = 64 / EltSizeInBits;
10682 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10683 unsigned SrcEltBits = EltSizeInBits * Scale;
10684 unsigned NumSrcElts = NumElts / Scale;
10685 unsigned UpperElts = NumElts - NumSrcElts;
10686 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10687 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10688 continue;
10689
10690 // Attempt to find a matching source truncation, but as a fall back VLX
10691 // cases can use the VPMOV directly.
10692 SDValue Src = peekThroughBitcasts(V1);
10693 if (Src.getOpcode() == ISD::TRUNCATE &&
10694 Src.getScalarValueSizeInBits() == SrcEltBits) {
10695 Src = Src.getOperand(0);
10696 } else if (Subtarget.hasVLX()) {
10697 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10698 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10699 Src = DAG.getBitcast(SrcVT, Src);
10700 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10701 if (Scale == 2 &&
10702 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10703 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10704 return SDValue();
10705 } else
10706 return SDValue();
10707
10708 // VPMOVWB is only available with avx512bw.
10709 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10710 return SDValue();
10711
10712 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10713 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10714 }
10715
10716 return SDValue();
10717}
10718
10719// Attempt to match binary shuffle patterns as a truncate.
10720static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10721 SDValue V2, ArrayRef<int> Mask,
10722 const APInt &Zeroable,
10723 const X86Subtarget &Subtarget,
10724 SelectionDAG &DAG) {
10725 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10726 "Unexpected VTRUNC type");
10727 if (!Subtarget.hasAVX512() ||
10728 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10729 return SDValue();
10730
10731 unsigned NumElts = VT.getVectorNumElements();
10732 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10733 unsigned MaxScale = 64 / EltSizeInBits;
10734 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10735 // TODO: Support non-BWI VPMOVWB truncations?
10736 unsigned SrcEltBits = EltSizeInBits * Scale;
10737 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10738 continue;
10739
10740 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10741 // Bail if the V2 elements are undef.
10742 unsigned NumHalfSrcElts = NumElts / Scale;
10743 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10744 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10745 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10746 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10747 continue;
10748
10749 // The elements beyond the truncation must be undef/zero.
10750 unsigned UpperElts = NumElts - NumSrcElts;
10751 if (UpperElts > 0 &&
10752 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10753 continue;
10754 bool UndefUppers =
10755 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10756
10757 // As we're using both sources then we need to concat them together
10758 // and truncate from the double-sized src.
10759 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10760
10761 // For offset truncations, ensure that the concat is cheap.
10762 SDValue Src =
10763 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10764 if (!Src) {
10765 if (Offset)
10766 continue;
10767 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10768 }
10769
10770 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10771 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10772 Src = DAG.getBitcast(SrcVT, Src);
10773
10774 // Shift the offset'd elements into place for the truncation.
10775 // TODO: Use getTargetVShiftByConstNode.
10776 if (Offset)
10777 Src = DAG.getNode(
10778 X86ISD::VSRLI, DL, SrcVT, Src,
10779 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10780
10781 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10782 }
10783 }
10784
10785 return SDValue();
10786}
10787
10788/// Check whether a compaction lowering can be done by dropping even/odd
10789/// elements and compute how many times even/odd elements must be dropped.
10790///
10791/// This handles shuffles which take every Nth element where N is a power of
10792/// two. Example shuffle masks:
10793///
10794/// (even)
10795/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10796/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10797/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10798/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10799/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10800/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10801///
10802/// (odd)
10803/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10804/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10805///
10806/// Any of these lanes can of course be undef.
10807///
10808/// This routine only supports N <= 3.
10809/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10810/// for larger N.
10811///
10812/// \returns N above, or the number of times even/odd elements must be dropped
10813/// if there is such a number. Otherwise returns zero.
10814static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10815 bool IsSingleInput) {
10816 // The modulus for the shuffle vector entries is based on whether this is
10817 // a single input or not.
10818 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10819 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10820 "We should only be called with masks with a power-of-2 size!");
10821
10822 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10823 int Offset = MatchEven ? 0 : 1;
10824
10825 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10826 // and 2^3 simultaneously. This is because we may have ambiguity with
10827 // partially undef inputs.
10828 bool ViableForN[3] = {true, true, true};
10829
10830 for (int i = 0, e = Mask.size(); i < e; ++i) {
10831 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10832 // want.
10833 if (Mask[i] < 0)
10834 continue;
10835
10836 bool IsAnyViable = false;
10837 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10838 if (ViableForN[j]) {
10839 uint64_t N = j + 1;
10840
10841 // The shuffle mask must be equal to (i * 2^N) % M.
10842 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10843 IsAnyViable = true;
10844 else
10845 ViableForN[j] = false;
10846 }
10847 // Early exit if we exhaust the possible powers of two.
10848 if (!IsAnyViable)
10849 break;
10850 }
10851
10852 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10853 if (ViableForN[j])
10854 return j + 1;
10855
10856 // Return 0 as there is no viable power of two.
10857 return 0;
10858}
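// Worked example (illustrative, not in the original source): the 16-element
// single-input mask <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14> matches the N==1
// even pattern (each entry equals (i * 2) modulo 16), so the function returns
// 1; asking for N==2 fails already at index 1 because 4 != 2.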
10859
10860// X86 has dedicated pack instructions that can handle specific truncation
10861// operations: PACKSS and PACKUS.
10862// Checks for compaction shuffle masks if MaxStages > 1.
10863// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10864static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10865 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10866 const SelectionDAG &DAG,
10867 const X86Subtarget &Subtarget,
10868 unsigned MaxStages = 1) {
10869 unsigned NumElts = VT.getVectorNumElements();
10870 unsigned BitSize = VT.getScalarSizeInBits();
10871 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10872 "Illegal maximum compaction");
10873
10874 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10875 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10876 unsigned NumPackedBits = NumSrcBits - BitSize;
10877 N1 = peekThroughBitcasts(N1);
10878 N2 = peekThroughBitcasts(N2);
10879 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10880 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10881 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10882 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10883 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10884 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10885 return false;
10886 if (Subtarget.hasSSE41() || BitSize == 8) {
10887 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10888 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10889 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10890 V1 = N1;
10891 V2 = N2;
10892 SrcVT = PackVT;
10893 PackOpcode = X86ISD::PACKUS;
10894 return true;
10895 }
10896 }
10897 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10898 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10899 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10900 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10901 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10902 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10903 V1 = N1;
10904 V2 = N2;
10905 SrcVT = PackVT;
10906 PackOpcode = X86ISD::PACKSS;
10907 return true;
10908 }
10909 return false;
10910 };
10911
10912 // Attempt to match against wider and wider compaction patterns.
10913 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10914 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10915 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10916
10917 // Try binary shuffle.
10918 SmallVector<int, 32> BinaryMask;
10919 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10920 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10921 if (MatchPACK(V1, V2, PackVT))
10922 return true;
10923
10924 // Try unary shuffle.
10925 SmallVector<int, 32> UnaryMask;
10926 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10927 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10928 if (MatchPACK(V1, V1, PackVT))
10929 return true;
10930 }
10931
10932 return false;
10933}
10934
10935static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10936 SDValue V2, ArrayRef<int> Mask,
10937 const X86Subtarget &Subtarget,
10938 SelectionDAG &DAG) {
10939 MVT PackVT;
10940 unsigned PackOpcode;
10941 unsigned SizeBits = VT.getSizeInBits();
10942 unsigned EltBits = VT.getScalarSizeInBits();
10943 unsigned MaxStages = Log2_32(64 / EltBits);
10944 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10945 Subtarget, MaxStages))
10946 return SDValue();
10947
10948 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10949 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10950
10951 // Don't lower multi-stage packs on AVX512, truncation is better.
10952 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10953 return SDValue();
10954
10955 // Pack to the largest type possible:
10956 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10957 unsigned MaxPackBits = 16;
10958 if (CurrentEltBits > 16 &&
10959 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10960 MaxPackBits = 32;
10961
10962 // Repeatedly pack down to the target size.
10963 SDValue Res;
10964 for (unsigned i = 0; i != NumStages; ++i) {
10965 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10966 unsigned NumSrcElts = SizeBits / SrcEltBits;
10967 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10968 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10969 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10970 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10971 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10972 DAG.getBitcast(SrcVT, V2));
10973 V1 = V2 = Res;
10974 CurrentEltBits /= 2;
10975 }
10976 assert(Res && Res.getValueType() == VT &&
10977 "Failed to lower compaction shuffle");
10978 return Res;
10979}
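// Illustrative note (not in the original source): if matchShuffleWithPACK
// reports PackVT == v4i32 for a v16i8 result, then NumStages == 2 and the
// loop emits two pack steps, e.g. PACKSSDW (v4i32 -> v8i16) followed by
// PACKSSWB (v8i16 -> v16i8), feeding each stage's result back in as both
// operands.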
10980
10981/// Try to emit a bitmask instruction for a shuffle.
10982///
10983/// This handles cases where we can model a blend exactly as a bitmask due to
10984/// one of the inputs being zeroable.
10985static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10986 SDValue V2, ArrayRef<int> Mask,
10987 const APInt &Zeroable,
10988 const X86Subtarget &Subtarget,
10989 SelectionDAG &DAG) {
10990 MVT MaskVT = VT;
10991 MVT EltVT = VT.getVectorElementType();
10992 SDValue Zero, AllOnes;
10993 // Use f64 if i64 isn't legal.
10994 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10995 EltVT = MVT::f64;
10996 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10997 }
10998
10999 MVT LogicVT = VT;
11000 if (EltVT.isFloatingPoint()) {
11001 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11002 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11003 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11004 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11005 } else {
11006 Zero = DAG.getConstant(0, DL, EltVT);
11007 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11008 }
11009
11010 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11011 SDValue V;
11012 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11013 if (Zeroable[i])
11014 continue;
11015 if (Mask[i] % Size != i)
11016 return SDValue(); // Not a blend.
11017 if (!V)
11018 V = Mask[i] < Size ? V1 : V2;
11019 else if (V != (Mask[i] < Size ? V1 : V2))
11020 return SDValue(); // Can only let one input through the mask.
11021
11022 VMaskOps[i] = AllOnes;
11023 }
11024 if (!V)
11025 return SDValue(); // No non-zeroable elements!
11026
11027 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11028 VMask = DAG.getBitcast(LogicVT, VMask);
11029 V = DAG.getBitcast(LogicVT, V);
11030 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11031 return DAG.getBitcast(VT, And);
11032}
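// Illustrative example (not in the original source): for v4i32 with
// Mask <0,5,2,7> where elements 1 and 3 are zeroable (V2 known zero), the
// blend is modelled as V1 & <-1,0,-1,0>, i.e. an AND with a constant mask
// instead of a real blend instruction.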
11033
11034/// Try to emit a blend instruction for a shuffle using bit math.
11035///
11036/// This is used as a fallback approach when first class blend instructions are
11037/// unavailable. Currently it is only suitable for integer vectors, but could
11038/// be generalized for floating point vectors if desirable.
11039static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11040 SDValue V2, ArrayRef<int> Mask,
11041 SelectionDAG &DAG) {
11042 assert(VT.isInteger() && "Only supports integer vector types!");
11043 MVT EltVT = VT.getVectorElementType();
11044 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11045 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11046 SmallVector<SDValue, 16> MaskOps;
11047 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11048 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11049 return SDValue(); // Shuffled input!
11050 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11051 }
11052
11053 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11054 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11055}
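// Illustrative example (not in the original source): for v8i16 with
// Mask <0,9,2,11,4,13,6,15> the computed V1Mask is <-1,0,-1,0,-1,0,-1,0>, so
// the result keeps V1's bits where the mask is all-ones and V2's bits
// elsewhere (the usual bit-select pattern).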
11056
11057static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11058 SDValue PreservedSrc,
11059 const X86Subtarget &Subtarget,
11060 SelectionDAG &DAG);
11061
11061
11062static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11063 MutableArrayRef<int> Mask,
11064 const APInt &Zeroable, bool &ForceV1Zero,
11065 bool &ForceV2Zero, uint64_t &BlendMask) {
11066 bool V1IsZeroOrUndef =
11067 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11068 bool V2IsZeroOrUndef =
11069 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11070
11071 BlendMask = 0;
11072 ForceV1Zero = false, ForceV2Zero = false;
11073 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11074
11075 int NumElts = Mask.size();
11076 int NumLanes = VT.getSizeInBits() / 128;
11077 int NumEltsPerLane = NumElts / NumLanes;
11078 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11079
11080 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11081 // then ensure the blend mask part for that lane just references that input.
11082 bool ForceWholeLaneMasks =
11083 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11084
11085 // Attempt to generate the binary blend mask. If an input is zero then
11086 // we can use any lane.
11087 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11088 // Keep track of the inputs used per lane.
11089 bool LaneV1InUse = false;
11090 bool LaneV2InUse = false;
11091 uint64_t LaneBlendMask = 0;
11092 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11093 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11094 int M = Mask[Elt];
11095 if (M == SM_SentinelUndef)
11096 continue;
11097 if (M == Elt || (0 <= M && M < NumElts &&
11098 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11099 Mask[Elt] = Elt;
11100 LaneV1InUse = true;
11101 continue;
11102 }
11103 if (M == (Elt + NumElts) ||
11104 (NumElts <= M &&
11105 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11106 LaneBlendMask |= 1ull << LaneElt;
11107 Mask[Elt] = Elt + NumElts;
11108 LaneV2InUse = true;
11109 continue;
11110 }
11111 if (Zeroable[Elt]) {
11112 if (V1IsZeroOrUndef) {
11113 ForceV1Zero = true;
11114 Mask[Elt] = Elt;
11115 LaneV1InUse = true;
11116 continue;
11117 }
11118 if (V2IsZeroOrUndef) {
11119 ForceV2Zero = true;
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 }
11126 return false;
11127 }
11128
11129 // If we only used V2 then splat the lane blend mask to avoid any demanded
11130 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11131 // blend mask bit).
11132 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11133 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11134
11135 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11136 }
11137 return true;
11138}
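// Worked example (illustrative, not in the original source): for v4i32 with
// Mask <0,5,2,7>, elements 1 and 3 select from V2, so the routine succeeds
// with BlendMask == 0b1010 and leaves ForceV1Zero/ForceV2Zero false.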
11139
11140/// Try to emit a blend instruction for a shuffle.
11141///
11142/// This doesn't do any checks for the availability of instructions for blending
11143/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11144/// be matched in the backend with the type given. What it does check for is
11145/// that the shuffle mask is a blend, or convertible into a blend with zero.
11146static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11147 SDValue V2, ArrayRef<int> Original,
11148 const APInt &Zeroable,
11149 const X86Subtarget &Subtarget,
11150 SelectionDAG &DAG) {
11151 uint64_t BlendMask = 0;
11152 bool ForceV1Zero = false, ForceV2Zero = false;
11153 SmallVector<int, 64> Mask(Original);
11154 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11155 BlendMask))
11156 return SDValue();
11157
11158 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11159 if (ForceV1Zero)
11160 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11161 if (ForceV2Zero)
11162 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11163
11164 unsigned NumElts = VT.getVectorNumElements();
11165
11166 switch (VT.SimpleTy) {
11167 case MVT::v4i64:
11168 case MVT::v8i32:
11169 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11170 [[fallthrough]];
11171 case MVT::v4f64:
11172 case MVT::v8f32:
11173 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11174 [[fallthrough]];
11175 case MVT::v2f64:
11176 case MVT::v2i64:
11177 case MVT::v4f32:
11178 case MVT::v4i32:
11179 case MVT::v8i16:
11180 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11181 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11182 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11183 case MVT::v16i16: {
11184 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11185 SmallVector<int, 8> RepeatedMask;
11186 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11187 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11188 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11189 BlendMask = 0;
11190 for (int i = 0; i < 8; ++i)
11191 if (RepeatedMask[i] >= 8)
11192 BlendMask |= 1ull << i;
11193 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11194 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11195 }
11196 // Use PBLENDW for lower/upper lanes and then blend lanes.
11197 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11198 // merge to VSELECT where useful.
11199 uint64_t LoMask = BlendMask & 0xFF;
11200 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11201 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11202 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11203 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11204 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11205 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11206 return DAG.getVectorShuffle(
11207 MVT::v16i16, DL, Lo, Hi,
11208 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11209 }
11210 [[fallthrough]];
11211 }
11212 case MVT::v32i8:
11213 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11214 [[fallthrough]];
11215 case MVT::v16i8: {
11216 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11217
11218 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11219 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11220 Subtarget, DAG))
11221 return Masked;
11222
11223 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11224 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11225 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11226 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11227 }
11228
11229 // If we have VPTERNLOG, we can use that as a bit blend.
11230 if (Subtarget.hasVLX())
11231 if (SDValue BitBlend =
11232 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11233 return BitBlend;
11234
11235 // Scale the blend by the number of bytes per element.
11236 int Scale = VT.getScalarSizeInBits() / 8;
11237
11238 // This form of blend is always done on bytes. Compute the byte vector
11239 // type.
11240 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11241
11242 // x86 allows load folding with blendvb from the 2nd source operand. But
11243 // we are still using LLVM select here (see comment below), so that's V1.
11244 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11245 // allow that load-folding possibility.
11246 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11247 ShuffleVectorSDNode::commuteMask(Mask);
11248 std::swap(V1, V2);
11249 }
11250
11251 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11252 // mix of LLVM's code generator and the x86 backend. We tell the code
11253 // generator that boolean values in the elements of an x86 vector register
11254 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11255 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11256 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11257 // of the element (the remaining are ignored) and 0 in that high bit would
11258 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11259 // the LLVM model for boolean values in vector elements gets the relevant
11260 // bit set, it is set backwards and over constrained relative to x86's
11261 // actual model.
11262 SmallVector<SDValue, 32> VSELECTMask;
11263 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11264 for (int j = 0; j < Scale; ++j)
11265 VSELECTMask.push_back(
11266 Mask[i] < 0
11267 ? DAG.getUNDEF(MVT::i8)
11268 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11269
11270 V1 = DAG.getBitcast(BlendVT, V1);
11271 V2 = DAG.getBitcast(BlendVT, V2);
11272 return DAG.getBitcast(
11273 VT,
11274 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11275 V1, V2));
11276 }
11277 case MVT::v16f32:
11278 case MVT::v8f64:
11279 case MVT::v8i64:
11280 case MVT::v16i32:
11281 case MVT::v32i16:
11282 case MVT::v64i8: {
11283 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11284 bool OptForSize = DAG.shouldOptForSize();
11285 if (!OptForSize) {
11286 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11287 Subtarget, DAG))
11288 return Masked;
11289 }
11290
11291 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11292 // masked move.
11293 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11294 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11295 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11296 }
11297 default:
11298 llvm_unreachable("Not a supported integer vector type!");
11299 }
11300}
11301
11302/// Try to lower as a blend of elements from two inputs followed by
11303/// a single-input permutation.
11304///
11305/// This matches the pattern where we can blend elements from two inputs and
11306/// then reduce the shuffle to a single-input permutation.
11307static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11308 SDValue V1, SDValue V2,
11309 ArrayRef<int> Mask,
11310 SelectionDAG &DAG,
11311 bool ImmBlends = false) {
11312 // We build up the blend mask while checking whether a blend is a viable way
11313 // to reduce the shuffle.
11314 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11315 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11316
11317 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11318 if (Mask[i] < 0)
11319 continue;
11320
11321 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11322
11323 if (BlendMask[Mask[i] % Size] < 0)
11324 BlendMask[Mask[i] % Size] = Mask[i];
11325 else if (BlendMask[Mask[i] % Size] != Mask[i])
11326 return SDValue(); // Can't blend in the needed input!
11327
11328 PermuteMask[i] = Mask[i] % Size;
11329 }
11330
11331 // If only immediate blends, then bail if the blend mask can't be widened to
11332 // i16.
11333 unsigned EltSize = VT.getScalarSizeInBits();
11334 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11335 return SDValue();
11336
11337 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11338 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11339}
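// Illustrative example (not in the original source): the v4i32 mask <6,0,3,1>
// can be rewritten as a blend <0,1,6,3> (each slot keeps its own position,
// taking slot 2 from V2) followed by the single-input permute <2,0,3,1>.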
11340
11341/// Try to lower as an unpack of elements from two inputs followed by
11342/// a single-input permutation.
11343///
11344/// This matches the pattern where we can unpack elements from two inputs and
11345/// then reduce the shuffle to a single-input (wider) permutation.
11346static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11347 SDValue V1, SDValue V2,
11348 ArrayRef<int> Mask,
11349 SelectionDAG &DAG) {
11350 int NumElts = Mask.size();
11351 int NumLanes = VT.getSizeInBits() / 128;
11352 int NumLaneElts = NumElts / NumLanes;
11353 int NumHalfLaneElts = NumLaneElts / 2;
11354
11355 bool MatchLo = true, MatchHi = true;
11356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11357
11358 // Determine UNPCKL/UNPCKH type and operand order.
11359 for (int Elt = 0; Elt != NumElts; ++Elt) {
11360 int M = Mask[Elt];
11361 if (M < 0)
11362 continue;
11363
11364 // Normalize the mask value depending on whether it's V1 or V2.
11365 int NormM = M;
11366 SDValue &Op = Ops[Elt & 1];
11367 if (M < NumElts && (Op.isUndef() || Op == V1))
11368 Op = V1;
11369 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11370 Op = V2;
11371 NormM -= NumElts;
11372 } else
11373 return SDValue();
11374
11375 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11376 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11377 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11378 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11379 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11380 if (MatchLoAnyLane || MatchHiAnyLane) {
11381 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11382 "Failed to match UNPCKLO/UNPCKHI");
11383 break;
11384 }
11385 }
11386 MatchLo &= MatchLoAnyLane;
11387 MatchHi &= MatchHiAnyLane;
11388 if (!MatchLo && !MatchHi)
11389 return SDValue();
11390 }
11391 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11392
11393 // Element indices have changed after unpacking. Calculate permute mask
11394 // so that they will be put back to the position as dictated by the
11395 // original shuffle mask indices.
11396 SmallVector<int, 32> PermuteMask(NumElts, -1);
11397 for (int Elt = 0; Elt != NumElts; ++Elt) {
11398 int M = Mask[Elt];
11399 if (M < 0)
11400 continue;
11401 int NormM = M;
11402 if (NumElts <= M)
11403 NormM -= NumElts;
11404 bool IsFirstOp = M < NumElts;
11405 int BaseMaskElt =
11406 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11407 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11408 PermuteMask[Elt] = BaseMaskElt;
11409 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11410 PermuteMask[Elt] = BaseMaskElt + 1;
11411 assert(PermuteMask[Elt] != -1 &&
11412 "Input mask element is defined but failed to assign permute mask");
11413 }
11414
11415 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11416 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11417 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11418}
11419
11420/// Try to lower a shuffle as a permute of the inputs followed by an
11421/// UNPCK instruction.
11422///
11423/// This specifically targets cases where we end up with alternating between
11424/// the two inputs, and so can permute them into something that feeds a single
11425/// UNPCK instruction. Note that this routine only targets integer vectors
11426/// because for floating point vectors we have a generalized SHUFPS lowering
11427/// strategy that handles everything that doesn't *exactly* match an unpack,
11428/// making this clever lowering unnecessary.
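/// For example (illustrative), the v8i16 mask <1,8,0,9,3,10,2,11> can be
/// handled by pre-shuffling V1 with <1,0,3,2,-1,-1,-1,-1> and then emitting
/// UNPCKL of the permuted V1 with V2.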
11429 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11430 SDValue V1, SDValue V2,
11431 ArrayRef<int> Mask,
11432 const X86Subtarget &Subtarget,
11433 SelectionDAG &DAG) {
11434 int Size = Mask.size();
11435 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11436
11437 // This routine only supports 128-bit integer dual input vectors.
11438 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11439 return SDValue();
11440
11441 int NumLoInputs =
11442 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11443 int NumHiInputs =
11444 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11445
11446 bool UnpackLo = NumLoInputs >= NumHiInputs;
11447
11448 auto TryUnpack = [&](int ScalarSize, int Scale) {
11449 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11450 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11451
11452 for (int i = 0; i < Size; ++i) {
11453 if (Mask[i] < 0)
11454 continue;
11455
11456 // Each element of the unpack contains Scale elements from this mask.
11457 int UnpackIdx = i / Scale;
11458
11459 // We only handle the case where V1 feeds the first slots of the unpack.
11460 // We rely on canonicalization to ensure this is the case.
11461 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11462 return SDValue();
11463
11464 // Setup the mask for this input. The indexing is tricky as we have to
11465 // handle the unpack stride.
11466 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11467 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11468 Mask[i] % Size;
11469 }
11470
11471 // If we will have to shuffle both inputs to use the unpack, check whether
11472 // we can just unpack first and shuffle the result. If so, skip this unpack.
11473 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11474 !isNoopShuffleMask(V2Mask))
11475 return SDValue();
11476
11477 // Shuffle the inputs into place.
11478 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11479 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11480
11481 // Cast the inputs to the type we will use to unpack them.
11482 MVT UnpackVT =
11483 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11484 V1 = DAG.getBitcast(UnpackVT, V1);
11485 V2 = DAG.getBitcast(UnpackVT, V2);
11486
11487 // Unpack the inputs and cast the result back to the desired type.
11488 return DAG.getBitcast(
11489 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11490 UnpackVT, V1, V2));
11491 };
11492
11493 // We try each unpack from the largest to the smallest to try and find one
11494 // that fits this mask.
11495 int OrigScalarSize = VT.getScalarSizeInBits();
11496 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11497 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11498 return Unpack;
11499
11500 // If we're shuffling with a zero vector then we're better off not doing
11501 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11502 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11503 ISD::isBuildVectorAllZeros(V2.getNode()))
11504 return SDValue();
11505
11506 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11507 // initial unpack.
11508 if (NumLoInputs == 0 || NumHiInputs == 0) {
11509 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11510 "We have to have *some* inputs!");
11511 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11512
11513 // FIXME: We could consider the total complexity of the permute of each
11514 // possible unpacking. Or at the least we should consider how many
11515 // half-crossings are created.
11516 // FIXME: We could consider commuting the unpacks.
11517
11518 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11519 for (int i = 0; i < Size; ++i) {
11520 if (Mask[i] < 0)
11521 continue;
11522
11523 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11524
11525 PermMask[i] =
11526 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11527 }
11528 return DAG.getVectorShuffle(
11529 VT, DL,
11530 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11531 V1, V2),
11532 DAG.getUNDEF(VT), PermMask);
11533 }
11534
11535 return SDValue();
11536}
11537
11538/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11539/// permuting the elements of the result in place.
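/// For example (an illustrative sketch): if a v16i8 shuffle only needs bytes
/// 12..15 of V1 and bytes 0..3 of V2, a single PALIGNR by 12 bytes yields
/// [V1[12..15], V2[0..11]], and a single-input byte permute then places those
/// elements according to the original mask.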
11540 static SDValue lowerShuffleAsByteRotateAndPermute(
11541 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11542 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11543 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11544 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11545 (VT.is512BitVector() && !Subtarget.hasBWI()))
11546 return SDValue();
11547
11548 // We don't currently support lane crossing permutes.
11549 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11550 return SDValue();
11551
11552 int Scale = VT.getScalarSizeInBits() / 8;
11553 int NumLanes = VT.getSizeInBits() / 128;
11554 int NumElts = VT.getVectorNumElements();
11555 int NumEltsPerLane = NumElts / NumLanes;
11556
11557 // Determine range of mask elts.
11558 bool Blend1 = true;
11559 bool Blend2 = true;
11560 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11561 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11562 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11563 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11564 int M = Mask[Lane + Elt];
11565 if (M < 0)
11566 continue;
11567 if (M < NumElts) {
11568 Blend1 &= (M == (Lane + Elt));
11569 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11570 M = M % NumEltsPerLane;
11571 Range1.first = std::min(Range1.first, M);
11572 Range1.second = std::max(Range1.second, M);
11573 } else {
11574 M -= NumElts;
11575 Blend2 &= (M == (Lane + Elt));
11576 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11577 M = M % NumEltsPerLane;
11578 Range2.first = std::min(Range2.first, M);
11579 Range2.second = std::max(Range2.second, M);
11580 }
11581 }
11582 }
11583
11584 // Bail if we don't need both elements.
11585 // TODO - it might be worth doing this for unary shuffles if the permute
11586 // can be widened.
11587 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11588 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11589 return SDValue();
11590
11591 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11592 return SDValue();
11593
11594 // Rotate the 2 ops so we can access both ranges, then permute the result.
11595 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11596 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11597 SDValue Rotate = DAG.getBitcast(
11598 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11599 DAG.getBitcast(ByteVT, Lo),
11600 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11601 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11602 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11603 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11604 int M = Mask[Lane + Elt];
11605 if (M < 0)
11606 continue;
11607 if (M < NumElts)
11608 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11609 else
11610 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11611 }
11612 }
11613 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11614 };
11615
11616 // Check if the ranges are small enough to rotate from either direction.
11617 if (Range2.second < Range1.first)
11618 return RotateAndPermute(V1, V2, Range1.first, 0);
11619 if (Range1.second < Range2.first)
11620 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11621 return SDValue();
11622}
11623
11624 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11625 return isUndefOrEqual(Mask, 0);
11626}
11627
11629 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11630}
11631
11632/// Check if the Mask consists of the same element repeated multiple times.
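/// For example, <3,3,u,3,3,3,u,3> qualifies, while <3,u,u,u,u,u,u,3> does not
/// because more than half of the elements are undef.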
11633 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11634 size_t NumUndefs = 0;
11635 std::optional<int> UniqueElt;
11636 for (int Elt : Mask) {
11637 if (Elt == SM_SentinelUndef) {
11638 NumUndefs++;
11639 continue;
11640 }
11641 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11642 return false;
11643 UniqueElt = Elt;
11644 }
11645 // Make sure the element is repeated enough times by checking the number of
11646 // undefs is small.
11647 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11648}
11649
11650/// Generic routine to decompose a shuffle and blend into independent
11651/// blends and permutes.
11652///
11653/// This matches the extremely common pattern for handling combined
11654/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11655/// operations. It will try to pick the best arrangement of shuffles and
11656/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
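/// For example (illustrative), a v4i32 mask <2,7,1,4> decomposes into
/// V1Mask <2,-1,1,-1>, V2Mask <-1,3,-1,0> and the final blend mask <0,5,2,7>;
/// the cheaper blend/unpack/rotate strategies below may of course fire first.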
11657 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11658 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11659 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11660 int NumElts = Mask.size();
11661 int NumLanes = VT.getSizeInBits() / 128;
11662 int NumEltsPerLane = NumElts / NumLanes;
11663
11664 // Shuffle the input elements into the desired positions in V1 and V2 and
11665 // unpack/blend them together.
11666 bool IsAlternating = true;
11667 bool V1Zero = true, V2Zero = true;
11668 SmallVector<int, 32> V1Mask(NumElts, -1);
11669 SmallVector<int, 32> V2Mask(NumElts, -1);
11670 SmallVector<int, 32> FinalMask(NumElts, -1);
11671 for (int i = 0; i < NumElts; ++i) {
11672 int M = Mask[i];
11673 if (M >= 0 && M < NumElts) {
11674 V1Mask[i] = M;
11675 FinalMask[i] = i;
11676 V1Zero &= Zeroable[i];
11677 IsAlternating &= (i & 1) == 0;
11678 } else if (M >= NumElts) {
11679 V2Mask[i] = M - NumElts;
11680 FinalMask[i] = i + NumElts;
11681 V2Zero &= Zeroable[i];
11682 IsAlternating &= (i & 1) == 1;
11683 }
11684 }
11685
11686 // If we effectively only demand the 0'th element of \p Input, and not only
11687 // as 0'th element, then broadcast said input,
11688 // and change \p InputMask to be a no-op (identity) mask.
11689 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11690 &DAG](SDValue &Input,
11691 MutableArrayRef<int> InputMask) {
11692 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11693 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11694 !X86::mayFoldLoad(Input, Subtarget)))
11695 return;
11696 if (isNoopShuffleMask(InputMask))
11697 return;
11698 assert(isBroadcastShuffleMask(InputMask) &&
11699 "Expected to demand only the 0'th element.");
11701 for (auto I : enumerate(InputMask)) {
11702 int &InputMaskElt = I.value();
11703 if (InputMaskElt >= 0)
11704 InputMaskElt = I.index();
11705 }
11706 };
11707
11708 // Currently, we may need to produce one shuffle per input, and blend results.
11709 // It is possible that the shuffle for one of the inputs is already a no-op.
11710 // See if we can simplify non-no-op shuffles into broadcasts,
11711 // which we consider to be strictly better than an arbitrary shuffle.
11712 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11713 isNoopOrBroadcastShuffleMask(V2Mask)) {
11714 canonicalizeBroadcastableInput(V1, V1Mask);
11715 canonicalizeBroadcastableInput(V2, V2Mask);
11716 }
11717
11718 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11719 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11720 // the shuffle may be able to fold with a load or other benefit. However, when
11721 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11722 // pre-shuffle first is a better strategy.
11723 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11724 // If we don't have blends, see if we can create a cheap unpack.
11725 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11726 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11727 is128BitUnpackShuffleMask(V2Mask, DAG)))
11728 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11729 DL, VT, V1, V2, Mask, Subtarget, DAG))
11730 return PermUnpack;
11731
11732 // Only prefer immediate blends to unpack/rotate.
11733 if (SDValue BlendPerm =
11734 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11735 return BlendPerm;
11736
11737 // If either input vector provides only a single element which is repeated
11738 // multiple times, unpacking from both input vectors would generate worse
11739 // code. e.g. for
11740 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11741 // it is better to process t4 first to create a vector of t4[0], then unpack
11742 // that vector with t2.
11743 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11744 !isSingleElementRepeatedMask(V2Mask))
11745 if (SDValue UnpackPerm =
11746 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11747 return UnpackPerm;
11748
11749 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11750 DL, VT, V1, V2, Mask, Subtarget, DAG))
11751 return RotatePerm;
11752
11753 // Unpack/rotate failed - try again with variable blends.
11754 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11755 DAG))
11756 return BlendPerm;
11757
11758 if (VT.getScalarSizeInBits() >= 32)
11759 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11760 DL, VT, V1, V2, Mask, Subtarget, DAG))
11761 return PermUnpack;
11762 }
11763
11764 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11765 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11766 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11767 // than half the elements coming from each source.
11768 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11769 V1Mask.assign(NumElts, -1);
11770 V2Mask.assign(NumElts, -1);
11771 FinalMask.assign(NumElts, -1);
11772 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11773 for (int j = 0; j != NumEltsPerLane; ++j) {
11774 int M = Mask[i + j];
11775 if (M >= 0 && M < NumElts) {
11776 V1Mask[i + (j / 2)] = M;
11777 FinalMask[i + j] = i + (j / 2);
11778 } else if (M >= NumElts) {
11779 V2Mask[i + (j / 2)] = M - NumElts;
11780 FinalMask[i + j] = i + (j / 2) + NumElts;
11781 }
11782 }
11783 }
11784
11785 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11786 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11787 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11788}
11789
11790static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11791 const X86Subtarget &Subtarget,
11792 ArrayRef<int> Mask) {
11793 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11794 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11795
11796 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11797 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11798 int MaxSubElts = 64 / EltSizeInBits;
11799 unsigned RotateAmt, NumSubElts;
11800 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11801 MaxSubElts, NumSubElts, RotateAmt))
11802 return -1;
11803 unsigned NumElts = Mask.size();
11804 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11805 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11806 return RotateAmt;
11807}
11808
11809/// Lower shuffle using X86ISD::VROTLI rotations.
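/// For example (illustrative), the v16i8 mask <1,0,3,2,5,4,7,6,...,15,14>
/// swaps the bytes of each i16 group and corresponds to a v8i16 rotation by
/// 8 bits, provided the sub-group size limits in matchShuffleAsBitRotate
/// allow it for the target.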
11810 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11811 ArrayRef<int> Mask,
11812 const X86Subtarget &Subtarget,
11813 SelectionDAG &DAG) {
11814 // Only XOP + AVX512 targets have bit rotation instructions.
11815 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11816 bool IsLegal =
11817 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11818 if (!IsLegal && Subtarget.hasSSE3())
11819 return SDValue();
11820
11821 MVT RotateVT;
11822 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11823 Subtarget, Mask);
11824 if (RotateAmt < 0)
11825 return SDValue();
11826
11827 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11828 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11829 // widen to vXi16 or more then the existing lowering should be better.
11830 if (!IsLegal) {
11831 if ((RotateAmt % 16) == 0)
11832 return SDValue();
11833 // TODO: Use getTargetVShiftByConstNode.
11834 unsigned ShlAmt = RotateAmt;
11835 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11836 V1 = DAG.getBitcast(RotateVT, V1);
11837 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11838 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11839 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11840 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11841 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11842 return DAG.getBitcast(VT, Rot);
11843 }
11844
11845 SDValue Rot =
11846 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11847 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11848 return DAG.getBitcast(VT, Rot);
11849}
11850
11851/// Try to match a vector shuffle as an element rotation.
11852///
11853 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11854 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11855 ArrayRef<int> Mask) {
11856 int NumElts = Mask.size();
11857
11858 // We need to detect various ways of spelling a rotation:
11859 // [11, 12, 13, 14, 15, 0, 1, 2]
11860 // [-1, 12, 13, 14, -1, -1, 1, -1]
11861 // [-1, -1, -1, -1, -1, -1, 1, 2]
11862 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11863 // [-1, 4, 5, 6, -1, -1, 9, -1]
11864 // [-1, 4, 5, 6, -1, -1, -1, -1]
11865 int Rotation = 0;
11866 SDValue Lo, Hi;
11867 for (int i = 0; i < NumElts; ++i) {
11868 int M = Mask[i];
11869 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11870 "Unexpected mask index.");
11871 if (M < 0)
11872 continue;
11873
11874 // Determine where a rotated vector would have started.
11875 int StartIdx = i - (M % NumElts);
11876 if (StartIdx == 0)
11877 // The identity rotation isn't interesting, stop.
11878 return -1;
11879
11880 // If we found the tail of a vector the rotation must be the missing
11881 // front. If we found the head of a vector, it must be how much of the
11882 // head.
11883 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11884
11885 if (Rotation == 0)
11886 Rotation = CandidateRotation;
11887 else if (Rotation != CandidateRotation)
11888 // The rotations don't match, so we can't match this mask.
11889 return -1;
11890
11891 // Compute which value this mask is pointing at.
11892 SDValue MaskV = M < NumElts ? V1 : V2;
11893
11894 // Compute which of the two target values this index should be assigned
11895 // to. This reflects whether the high elements are remaining or the low
11896 // elements are remaining.
11897 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11898
11899 // Either set up this value if we've not encountered it before, or check
11900 // that it remains consistent.
11901 if (!TargetV)
11902 TargetV = MaskV;
11903 else if (TargetV != MaskV)
11904 // This may be a rotation, but it pulls from the inputs in some
11905 // unsupported interleaving.
11906 return -1;
11907 }
11908
11909 // Check that we successfully analyzed the mask, and normalize the results.
11910 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11911 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11912 if (!Lo)
11913 Lo = Hi;
11914 else if (!Hi)
11915 Hi = Lo;
11916
11917 V1 = Lo;
11918 V2 = Hi;
11919
11920 return Rotation;
11921}
11922
11923/// Try to lower a vector shuffle as a byte rotation.
11924///
11925/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11926/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11927/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11928 /// try to generically lower a vector shuffle through such a pattern. It
11929/// does not check for the profitability of lowering either as PALIGNR or
11930/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11931/// This matches shuffle vectors that look like:
11932///
11933/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11934///
11935/// Essentially it concatenates V1 and V2, shifts right by some number of
11936/// elements, and takes the low elements as the result. Note that while this is
11937/// specified as a *right shift* because x86 is little-endian, it is a *left
11938/// rotate* of the vector lanes.
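/// For example, the v8i16 mask above ([11, 12, 13, 14, 15, 0, 1, 2]) yields
/// an element rotation of 3, which is scaled to a byte rotation of 6 for
/// PALIGNR.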
11939 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11940 ArrayRef<int> Mask) {
11941 // Don't accept any shuffles with zero elements.
11942 if (isAnyZero(Mask))
11943 return -1;
11944
11945 // PALIGNR works on 128-bit lanes.
11946 SmallVector<int, 16> RepeatedMask;
11947 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11948 return -1;
11949
11950 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11951 if (Rotation <= 0)
11952 return -1;
11953
11954 // PALIGNR rotates bytes, so we need to scale the
11955 // rotation based on how many bytes are in the vector lane.
11956 int NumElts = RepeatedMask.size();
11957 int Scale = 16 / NumElts;
11958 return Rotation * Scale;
11959}
11960
11961 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11962 SDValue V2, ArrayRef<int> Mask,
11963 const X86Subtarget &Subtarget,
11964 SelectionDAG &DAG) {
11965 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11966
11967 SDValue Lo = V1, Hi = V2;
11968 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11969 if (ByteRotation <= 0)
11970 return SDValue();
11971
11972 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11973 // PSLLDQ/PSRLDQ.
11974 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11975 Lo = DAG.getBitcast(ByteVT, Lo);
11976 Hi = DAG.getBitcast(ByteVT, Hi);
11977
11978 // SSSE3 targets can use the palignr instruction.
11979 if (Subtarget.hasSSSE3()) {
11980 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11981 "512-bit PALIGNR requires BWI instructions");
11982 return DAG.getBitcast(
11983 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11984 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11985 }
11986
11987 assert(VT.is128BitVector() &&
11988 "Rotate-based lowering only supports 128-bit lowering!");
11989 assert(Mask.size() <= 16 &&
11990 "Can shuffle at most 16 bytes in a 128-bit vector!");
11991 assert(ByteVT == MVT::v16i8 &&
11992 "SSE2 rotate lowering only needed for v16i8!");
11993
11994 // Default SSE2 implementation
11995 int LoByteShift = 16 - ByteRotation;
11996 int HiByteShift = ByteRotation;
11997
11998 SDValue LoShift =
11999 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12000 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12001 SDValue HiShift =
12002 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12003 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12004 return DAG.getBitcast(VT,
12005 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12006}
12007
12008/// Try to lower a vector shuffle as a dword/qword rotation.
12009///
12010 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12011 /// rotation of the concatenation of two vectors; this routine will
12012 /// try to generically lower a vector shuffle through such a pattern.
12013///
12014/// Essentially it concatenates V1 and V2, shifts right by some number of
12015/// elements, and takes the low elements as the result. Note that while this is
12016/// specified as a *right shift* because x86 is little-endian, it is a *left
12017/// rotate* of the vector lanes.
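/// For example (illustrative), the v8i32 mask <3,4,5,6,7,8,9,10> is matched
/// with a rotation amount of 3 and lowered to a single VALIGND.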
12018 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12019 SDValue V2, ArrayRef<int> Mask,
12020 const APInt &Zeroable,
12021 const X86Subtarget &Subtarget,
12022 SelectionDAG &DAG) {
12023 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12024 "Only 32-bit and 64-bit elements are supported!");
12025
12026 // 128/256-bit vectors are only supported with VLX.
12027 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12028 && "VLX required for 128/256-bit vectors");
12029
12030 SDValue Lo = V1, Hi = V2;
12031 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12032 if (0 < Rotation)
12033 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12034 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12035
12036 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12037 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12038 // TODO: We can probably make this more aggressive and use shift-pairs like
12039 // lowerShuffleAsByteShiftMask.
12040 unsigned NumElts = Mask.size();
12041 unsigned ZeroLo = Zeroable.countr_one();
12042 unsigned ZeroHi = Zeroable.countl_one();
12043 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12044 if (!ZeroLo && !ZeroHi)
12045 return SDValue();
12046
12047 if (ZeroLo) {
12048 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12049 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12050 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12051 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12052 getZeroVector(VT, Subtarget, DAG, DL),
12053 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12054 }
12055
12056 if (ZeroHi) {
12057 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12058 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12059 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12060 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12061 getZeroVector(VT, Subtarget, DAG, DL), Src,
12062 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12063 }
12064
12065 return SDValue();
12066}
12067
12068/// Try to lower a vector shuffle as a byte shift sequence.
12069 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12070 SDValue V2, ArrayRef<int> Mask,
12071 const APInt &Zeroable,
12072 const X86Subtarget &Subtarget,
12073 SelectionDAG &DAG) {
12074 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12075 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12076
12077 // We need a shuffle that has zeros at one/both ends and a sequential
12078 // shuffle from one source within.
12079 unsigned ZeroLo = Zeroable.countr_one();
12080 unsigned ZeroHi = Zeroable.countl_one();
12081 if (!ZeroLo && !ZeroHi)
12082 return SDValue();
12083
12084 unsigned NumElts = Mask.size();
12085 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12086 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12087 return SDValue();
12088
12089 unsigned Scale = VT.getScalarSizeInBits() / 8;
12090 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12091 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12092 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12093 return SDValue();
12094
12095 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12096 Res = DAG.getBitcast(MVT::v16i8, Res);
12097
12098 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12099 // inner sequential set of elements, possibly offset:
12100 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12101 // 01234567 --> 4567zzzz --> zzzzz456
12102 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12103 if (ZeroLo == 0) {
12104 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12105 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12106 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12107 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12108 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12109 } else if (ZeroHi == 0) {
12110 unsigned Shift = Mask[ZeroLo] % NumElts;
12111 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12112 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12113 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12114 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12115 } else if (!Subtarget.hasSSSE3()) {
12116 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12117 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12118 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12119 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12120 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12121 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12122 Shift += Mask[ZeroLo] % NumElts;
12123 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12124 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12125 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12127 } else
12128 return SDValue();
12129
12130 return DAG.getBitcast(VT, Res);
12131}
12132
12133/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12134///
12135/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12136/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12137/// matches elements from one of the input vectors shuffled to the left or
12138/// right with zeroable elements 'shifted in'. It handles both the strictly
12139/// bit-wise element shifts and the byte shift across an entire 128-bit double
12140/// quad word lane.
12141///
12142/// PSHL : (little-endian) left bit shift.
12143/// [ zz, 0, zz, 2 ]
12144/// [ -1, 4, zz, -1 ]
12145/// PSRL : (little-endian) right bit shift.
12146/// [ 1, zz, 3, zz]
12147/// [ -1, -1, 7, zz]
12148/// PSLLDQ : (little-endian) left byte shift
12149/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12150/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12151/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12152/// PSRLDQ : (little-endian) right byte shift
12153/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12154/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12155/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12156static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12157 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12158 int MaskOffset, const APInt &Zeroable,
12159 const X86Subtarget &Subtarget) {
12160 int Size = Mask.size();
12161 unsigned SizeInBits = Size * ScalarSizeInBits;
12162
12163 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12164 for (int i = 0; i < Size; i += Scale)
12165 for (int j = 0; j < Shift; ++j)
12166 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12167 return false;
12168
12169 return true;
12170 };
12171
12172 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12173 for (int i = 0; i != Size; i += Scale) {
12174 unsigned Pos = Left ? i + Shift : i;
12175 unsigned Low = Left ? i : i + Shift;
12176 unsigned Len = Scale - Shift;
12177 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12178 return -1;
12179 }
12180
12181 int ShiftEltBits = ScalarSizeInBits * Scale;
12182 bool ByteShift = ShiftEltBits > 64;
12183 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12184 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12185 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12186
12187 // Normalize the scale for byte shifts to still produce an i64 element
12188 // type.
12189 Scale = ByteShift ? Scale / 2 : Scale;
12190
12191 // We need to round trip through the appropriate type for the shift.
12192 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12193 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12194 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12195 return (int)ShiftAmt;
12196 };
12197
12198 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12199 // keep doubling the size of the integer elements up to that. We can
12200 // then shift the elements of the integer vector by whole multiples of
12201 // their width within the elements of the larger integer vector. Test each
12202 // multiple to see if we can find a match with the moved element indices
12203 // and that the shifted in elements are all zeroable.
12204 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12205 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12206 for (int Shift = 1; Shift != Scale; ++Shift)
12207 for (bool Left : {true, false})
12208 if (CheckZeros(Shift, Scale, Left)) {
12209 int ShiftAmt = MatchShift(Shift, Scale, Left);
12210 if (0 < ShiftAmt)
12211 return ShiftAmt;
12212 }
12213
12214 // no match
12215 return -1;
12216}
12217
12218 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12219 SDValue V2, ArrayRef<int> Mask,
12220 const APInt &Zeroable,
12221 const X86Subtarget &Subtarget,
12222 SelectionDAG &DAG, bool BitwiseOnly) {
12223 int Size = Mask.size();
12224 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12225
12226 MVT ShiftVT;
12227 SDValue V = V1;
12228 unsigned Opcode;
12229
12230 // Try to match shuffle against V1 shift.
12231 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12232 Mask, 0, Zeroable, Subtarget);
12233
12234 // If V1 failed, try to match shuffle against V2 shift.
12235 if (ShiftAmt < 0) {
12236 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12237 Mask, Size, Zeroable, Subtarget);
12238 V = V2;
12239 }
12240
12241 if (ShiftAmt < 0)
12242 return SDValue();
12243
12244 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12245 return SDValue();
12246
12247 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12248 "Illegal integer vector type");
12249 V = DAG.getBitcast(ShiftVT, V);
12250 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12251 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12252 return DAG.getBitcast(VT, V);
12253}
12254
12255// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12256// Remainder of lower half result is zero and upper half is all undef.
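// For example (illustrative), a v8i16 mask <2,3,zz,zz,u,u,u,u> is matched
// with BitLen = 32 and BitIdx = 32, i.e. two 16-bit elements extracted
// starting at element 2.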
12257static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12258 ArrayRef<int> Mask, uint64_t &BitLen,
12259 uint64_t &BitIdx, const APInt &Zeroable) {
12260 int Size = Mask.size();
12261 int HalfSize = Size / 2;
12262 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12263 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12264
12265 // Upper half must be undefined.
12266 if (!isUndefUpperHalf(Mask))
12267 return false;
12268
12269 // Determine the extraction length from the part of the
12270 // lower half that isn't zeroable.
12271 int Len = HalfSize;
12272 for (; Len > 0; --Len)
12273 if (!Zeroable[Len - 1])
12274 break;
12275 assert(Len > 0 && "Zeroable shuffle mask");
12276
12277 // Attempt to match first Len sequential elements from the lower half.
12278 SDValue Src;
12279 int Idx = -1;
12280 for (int i = 0; i != Len; ++i) {
12281 int M = Mask[i];
12282 if (M == SM_SentinelUndef)
12283 continue;
12284 SDValue &V = (M < Size ? V1 : V2);
12285 M = M % Size;
12286
12287 // The extracted elements must start at a valid index and all mask
12288 // elements must be in the lower half.
12289 if (i > M || M >= HalfSize)
12290 return false;
12291
12292 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12293 Src = V;
12294 Idx = M - i;
12295 continue;
12296 }
12297 return false;
12298 }
12299
12300 if (!Src || Idx < 0)
12301 return false;
12302
12303 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12304 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12305 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12306 V1 = Src;
12307 return true;
12308}
12309
12310// INSERTQ: Extract lowest Len elements from lower half of second source and
12311// insert over first source, starting at Idx.
12312// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12313static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12314 ArrayRef<int> Mask, uint64_t &BitLen,
12315 uint64_t &BitIdx) {
12316 int Size = Mask.size();
12317 int HalfSize = Size / 2;
12318 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12319
12320 // Upper half must be undefined.
12321 if (!isUndefUpperHalf(Mask))
12322 return false;
12323
12324 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12325 SDValue Base;
12326
12327 // Attempt to match first source from mask before insertion point.
12328 if (isUndefInRange(Mask, 0, Idx)) {
12329 /* EMPTY */
12330 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12331 Base = V1;
12332 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12333 Base = V2;
12334 } else {
12335 continue;
12336 }
12337
12338 // Extend the extraction length looking to match both the insertion of
12339 // the second source and the remaining elements of the first.
12340 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12341 SDValue Insert;
12342 int Len = Hi - Idx;
12343
12344 // Match insertion.
12345 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12346 Insert = V1;
12347 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12348 Insert = V2;
12349 } else {
12350 continue;
12351 }
12352
12353 // Match the remaining elements of the lower half.
12354 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12355 /* EMPTY */
12356 } else if ((!Base || (Base == V1)) &&
12357 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12358 Base = V1;
12359 } else if ((!Base || (Base == V2)) &&
12360 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12361 Size + Hi)) {
12362 Base = V2;
12363 } else {
12364 continue;
12365 }
12366
12367 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12368 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12369 V1 = Base;
12370 V2 = Insert;
12371 return true;
12372 }
12373 }
12374
12375 return false;
12376}
12377
12378/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12379 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12380 SDValue V2, ArrayRef<int> Mask,
12381 const APInt &Zeroable, SelectionDAG &DAG) {
12382 uint64_t BitLen, BitIdx;
12383 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12384 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12385 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12386 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12387
12388 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12389 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12390 V2 ? V2 : DAG.getUNDEF(VT),
12391 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12392 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12393
12394 return SDValue();
12395}
12396
12397/// Lower a vector shuffle as an any/signed/zero extension.
12398///
12399/// Given a specific number of elements, element bit width, and extension
12400 /// stride, produce an extension based on the available
12401 /// features of the subtarget. The extended elements are consecutive and
12402 /// can start from an offset element index in the input; to
12403 /// avoid excess shuffling, the offset must either be in the bottom lane
12404 /// or at the start of a higher lane. All extended elements must be from
12405/// the same lane.
12406 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12407 int Scale, int Offset,
12408 unsigned ExtOpc, SDValue InputV,
12409 ArrayRef<int> Mask,
12410 const X86Subtarget &Subtarget,
12411 SelectionDAG &DAG) {
12412 assert(Scale > 1 && "Need a scale to extend.");
12413 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12414 int EltBits = VT.getScalarSizeInBits();
12415 int NumElements = VT.getVectorNumElements();
12416 int NumEltsPerLane = 128 / EltBits;
12417 int OffsetLane = Offset / NumEltsPerLane;
12418 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12419 "Only 8, 16, and 32 bit elements can be extended.");
12420 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12421 assert(0 <= Offset && "Extension offset must be positive.");
12422 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12423 "Extension offset must be in the first lane or start an upper lane.");
12424
12425 // Check that an index is in same lane as the base offset.
12426 auto SafeOffset = [&](int Idx) {
12427 return OffsetLane == (Idx / NumEltsPerLane);
12428 };
12429
12430 // Shift along an input so that the offset base moves to the first element.
12431 auto ShuffleOffset = [&](SDValue V) {
12432 if (!Offset)
12433 return V;
12434
12435 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12436 for (int i = 0; i * Scale < NumElements; ++i) {
12437 int SrcIdx = i + Offset;
12438 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12439 }
12440 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12441 };
12442
12443 // Found a valid a/zext mask! Try various lowering strategies based on the
12444 // input type and available ISA extensions.
12445 if (Subtarget.hasSSE41()) {
12446 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12447 // PUNPCK will catch this in a later shuffle match.
12448 if (Offset && Scale == 2 && VT.is128BitVector())
12449 return SDValue();
12450 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12451 NumElements / Scale);
12452 InputV = DAG.getBitcast(VT, InputV);
12453 InputV = ShuffleOffset(InputV);
12454 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12455 return DAG.getBitcast(VT, InputV);
12456 }
12457
12458 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12459 InputV = DAG.getBitcast(VT, InputV);
12460 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12461
12462 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12463 if (ExtOpc == ISD::SIGN_EXTEND)
12464 return SDValue();
12465
12466 // For any extends we can cheat for larger element sizes and use shuffle
12467 // instructions that can fold with a load and/or copy.
12468 if (AnyExt && EltBits == 32) {
12469 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12470 -1};
12471 return DAG.getBitcast(
12472 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12473 DAG.getBitcast(MVT::v4i32, InputV),
12474 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12475 }
12476 if (AnyExt && EltBits == 16 && Scale > 2) {
12477 int PSHUFDMask[4] = {Offset / 2, -1,
12478 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12479 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12480 DAG.getBitcast(MVT::v4i32, InputV),
12481 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12482 int PSHUFWMask[4] = {1, -1, -1, -1};
12483 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12484 return DAG.getBitcast(
12485 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12486 DAG.getBitcast(MVT::v8i16, InputV),
12487 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12488 }
12489
12490 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12491 // to 64-bits.
12492 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12493 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12494 assert(VT.is128BitVector() && "Unexpected vector width!");
12495
12496 int LoIdx = Offset * EltBits;
12497 SDValue Lo = DAG.getBitcast(
12498 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12499 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12500 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12501
12502 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12503 return DAG.getBitcast(VT, Lo);
12504
12505 int HiIdx = (Offset + 1) * EltBits;
12506 SDValue Hi = DAG.getBitcast(
12507 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12508 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12509 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12510 return DAG.getBitcast(VT,
12511 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12512 }
12513
12514 // If this would require more than 2 unpack instructions to expand, use
12515 // pshufb when available. We can only use more than 2 unpack instructions
12516 // when zero extending i8 elements which also makes it easier to use pshufb.
12517 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12518 assert(NumElements == 16 && "Unexpected byte vector width!");
12519 SDValue PSHUFBMask[16];
12520 for (int i = 0; i < 16; ++i) {
12521 int Idx = Offset + (i / Scale);
12522 if ((i % Scale == 0 && SafeOffset(Idx))) {
12523 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12524 continue;
12525 }
12526 PSHUFBMask[i] =
12527 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12528 }
12529 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12530 return DAG.getBitcast(
12531 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12532 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12533 }
12534
12535 // If we are extending from an offset, ensure we start on a boundary that
12536 // we can unpack from.
12537 int AlignToUnpack = Offset % (NumElements / Scale);
12538 if (AlignToUnpack) {
12539 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12540 for (int i = AlignToUnpack; i < NumElements; ++i)
12541 ShMask[i - AlignToUnpack] = i;
12542 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12543 Offset -= AlignToUnpack;
12544 }
12545
12546 // Otherwise emit a sequence of unpacks.
12547 do {
12548 unsigned UnpackLoHi = X86ISD::UNPCKL;
12549 if (Offset >= (NumElements / 2)) {
12550 UnpackLoHi = X86ISD::UNPCKH;
12551 Offset -= (NumElements / 2);
12552 }
12553
12554 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12555 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12556 : getZeroVector(InputVT, Subtarget, DAG, DL);
12557 InputV = DAG.getBitcast(InputVT, InputV);
12558 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12559 Scale /= 2;
12560 EltBits *= 2;
12561 NumElements /= 2;
12562 } while (Scale > 1);
12563 return DAG.getBitcast(VT, InputV);
12564}
12565
12566/// Try to lower a vector shuffle as a zero extension on any microarch.
12567///
12568/// This routine will try to do everything in its power to cleverly lower
12569/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12570/// check for the profitability of this lowering, it tries to aggressively
12571/// match this pattern. It will use all of the micro-architectural details it
12572/// can to emit an efficient lowering. It handles both blends with all-zero
12573/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12574/// masking out later).
12575///
12576/// The reason we have dedicated lowering for zext-style shuffles is that they
12577/// are both incredibly common and often quite performance sensitive.
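/// For example (illustrative), a v8i16 shuffle whose mask selects elements
/// 0,1,2,3 of V1 into the even lanes with the odd lanes zeroable is
/// recognized with Scale = 2 and lowered as a zero extension of the low four
/// i16 elements to i32.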
12578 static SDValue lowerShuffleAsZeroOrAnyExtend(
12579 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12580 const APInt &Zeroable, const X86Subtarget &Subtarget,
12581 SelectionDAG &DAG) {
12582 int Bits = VT.getSizeInBits();
12583 int NumLanes = Bits / 128;
12584 int NumElements = VT.getVectorNumElements();
12585 int NumEltsPerLane = NumElements / NumLanes;
12586 assert(VT.getScalarSizeInBits() <= 32 &&
12587 "Exceeds 32-bit integer zero extension limit");
12588 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12589
12590 // Define a helper function to check a particular ext-scale and lower to it if
12591 // valid.
12592 auto Lower = [&](int Scale) -> SDValue {
12593 SDValue InputV;
12594 bool AnyExt = true;
12595 int Offset = 0;
12596 int Matches = 0;
12597 for (int i = 0; i < NumElements; ++i) {
12598 int M = Mask[i];
12599 if (M < 0)
12600 continue; // Valid anywhere but doesn't tell us anything.
12601 if (i % Scale != 0) {
12602 // Each of the extended elements need to be zeroable.
12603 if (!Zeroable[i])
12604 return SDValue();
12605
12606 // We no longer are in the anyext case.
12607 AnyExt = false;
12608 continue;
12609 }
12610
12611 // Each of the base elements needs to be consecutive indices into the
12612 // same input vector.
12613 SDValue V = M < NumElements ? V1 : V2;
12614 M = M % NumElements;
12615 if (!InputV) {
12616 InputV = V;
12617 Offset = M - (i / Scale);
12618 } else if (InputV != V)
12619 return SDValue(); // Flip-flopping inputs.
12620
12621 // Offset must start in the lowest 128-bit lane or at the start of an
12622 // upper lane.
12623 // FIXME: Is it ever worth allowing a negative base offset?
12624 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12625 (Offset % NumEltsPerLane) == 0))
12626 return SDValue();
12627
12628 // If we are offsetting, all referenced entries must come from the same
12629 // lane.
12630 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12631 return SDValue();
12632
12633 if ((M % NumElements) != (Offset + (i / Scale)))
12634 return SDValue(); // Non-consecutive strided elements.
12635 Matches++;
12636 }
12637
12638 // If we fail to find an input, we have a zero-shuffle which should always
12639 // have already been handled.
12640 // FIXME: Maybe handle this here in case during blending we end up with one?
12641 if (!InputV)
12642 return SDValue();
12643
12644 // If we are offsetting, don't extend if we only match a single input, we
12645 // can always do better by using a basic PSHUF or PUNPCK.
12646 if (Offset != 0 && Matches < 2)
12647 return SDValue();
12648
12649 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12650 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12651 InputV, Mask, Subtarget, DAG);
12652 };
12653
12654 // The widest scale possible for extending is to a 64-bit integer.
12655 assert(Bits % 64 == 0 &&
12656 "The number of bits in a vector must be divisible by 64 on x86!");
12657 int NumExtElements = Bits / 64;
12658
12659 // Each iteration, try extending the elements half as much, but into twice as
12660 // many elements.
12661 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12662 assert(NumElements % NumExtElements == 0 &&
12663 "The input vector size must be divisible by the extended size.");
12664 if (SDValue V = Lower(NumElements / NumExtElements))
12665 return V;
12666 }
12667
12668 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12669 if (Bits != 128)
12670 return SDValue();
12671
12672 // Returns one of the source operands if the shuffle can be reduced to a
12673 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12674 auto CanZExtLowHalf = [&]() {
12675 for (int i = NumElements / 2; i != NumElements; ++i)
12676 if (!Zeroable[i])
12677 return SDValue();
12678 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12679 return V1;
12680 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12681 return V2;
12682 return SDValue();
12683 };
12684
12685 if (SDValue V = CanZExtLowHalf()) {
12686 V = DAG.getBitcast(MVT::v2i64, V);
12687 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12688 return DAG.getBitcast(VT, V);
12689 }
12690
12691 // No viable ext lowering found.
12692 return SDValue();
12693}
12694
12695/// Try to get a scalar value for a specific element of a vector.
12696///
12697/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12698 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12699 SelectionDAG &DAG) {
12700 MVT VT = V.getSimpleValueType();
12701 MVT EltVT = VT.getVectorElementType();
12702 V = peekThroughBitcasts(V);
12703
12704 // If the bitcasts shift the element size, we can't extract an equivalent
12705 // element from it.
12706 MVT NewVT = V.getSimpleValueType();
12707 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12708 return SDValue();
12709
12710 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12711 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12712 // Ensure the scalar operand is the same size as the destination.
12713 // FIXME: Add support for scalar truncation where possible.
12714 SDValue S = V.getOperand(Idx);
12715 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12716 return DAG.getBitcast(EltVT, S);
12717 }
12718
12719 return SDValue();
12720}
12721
12722/// Helper to test for a load that can be folded with x86 shuffles.
12723///
12724/// This is particularly important because the set of instructions varies
12725/// significantly based on whether the operand is a load or not.
12726 static bool isShuffleFoldableLoad(SDValue V) {
12727 return V.hasOneUse() &&
12728 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12729 }
12730
12731template<typename T>
12732static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12733 T EltVT = VT.getScalarType();
12734 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12735 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12736}
12737
12738/// Try to lower insertion of a single element into a zero vector.
12739///
12740/// This is a common pattern that we have especially efficient patterns to lower
12741/// across all subtarget feature sets.
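/// For example (illustrative), with V1 known zeroable the v4f32 mask
/// <4,1,2,3> selects V2[0] into lane 0 and zeros elsewhere, which lowers to
/// X86ISD::VZEXT_MOVL of V2 (a MOVSS-style insertion into a zero vector).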
12742 static SDValue lowerShuffleAsElementInsertion(
12743 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12744 const APInt &Zeroable, const X86Subtarget &Subtarget,
12745 SelectionDAG &DAG) {
12746 MVT ExtVT = VT;
12747 MVT EltVT = VT.getVectorElementType();
12748 unsigned NumElts = VT.getVectorNumElements();
12749 unsigned EltBits = VT.getScalarSizeInBits();
12750
12751 if (isSoftF16(EltVT, Subtarget))
12752 return SDValue();
12753
12754 int V2Index =
12755 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12756 Mask.begin();
12757 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12758 bool IsV1Zeroable = true;
12759 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12760 if (i != V2Index && !Zeroable[i]) {
12761 IsV1Zeroable = false;
12762 break;
12763 }
12764
12765 // Bail if a non-zero V1 isn't used in place.
12766 if (!IsV1Zeroable) {
12767 SmallVector<int, 8> V1Mask(Mask);
12768 V1Mask[V2Index] = -1;
12769 if (!isNoopShuffleMask(V1Mask))
12770 return SDValue();
12771 }
12772
12773 // Check for a single input from a SCALAR_TO_VECTOR node.
12774 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12775 // all the smarts here sunk into that routine. However, the current
12776 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12777 // vector shuffle lowering is dead.
12778 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12779 DAG);
12780 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12781 // We need to zext the scalar if it is smaller than an i32.
12782 V2S = DAG.getBitcast(EltVT, V2S);
12783 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12784 // Using zext to expand a narrow element won't work for non-zero
12785 // insertions. But we can use a masked constant vector if we're
12786 // inserting V2 into the bottom of V1.
12787 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12788 return SDValue();
12789
12790 // Zero-extend directly to i32.
12791 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12792 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12793
12794 // If we're inserting into a constant, mask off the inserted index
12795 // and OR with the zero-extended scalar.
12796 if (!IsV1Zeroable) {
12797 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12798 Bits[V2Index] = APInt::getZero(EltBits);
12799 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12800 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12801 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12802 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12803 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12804 }
12805 }
12806 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12807 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12808 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12809 // Either not inserting from the low element of the input or the input
12810 // element size is too small to use VZEXT_MOVL to clear the high bits.
12811 return SDValue();
12812 }
12813
12814 if (!IsV1Zeroable) {
12815 // If V1 can't be treated as a zero vector we have fewer options to lower
12816 // this. We can't support integer vectors or non-zero targets cheaply.
12817 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12818 if (!VT.isFloatingPoint() || V2Index != 0)
12819 return SDValue();
12820 if (!VT.is128BitVector())
12821 return SDValue();
12822
12823 // Otherwise, use MOVSD, MOVSS or MOVSH.
12824 unsigned MovOpc = 0;
12825 if (EltVT == MVT::f16)
12826 MovOpc = X86ISD::MOVSH;
12827 else if (EltVT == MVT::f32)
12828 MovOpc = X86ISD::MOVSS;
12829 else if (EltVT == MVT::f64)
12830 MovOpc = X86ISD::MOVSD;
12831 else
12832 llvm_unreachable("Unsupported floating point element type to handle!");
12833 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12834 }
12835
12836 // This lowering only works for the low element with floating point vectors.
12837 if (VT.isFloatingPoint() && V2Index != 0)
12838 return SDValue();
12839
12840 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12841 if (ExtVT != VT)
12842 V2 = DAG.getBitcast(VT, V2);
12843
12844 if (V2Index != 0) {
12845 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12846 // the desired position. Otherwise it is more efficient to do a vector
12847 // shift left. We know that we can do a vector shift left because all
12848 // the inputs are zero.
12849 if (VT.isFloatingPoint() || NumElts <= 4) {
12850 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12851 V2Shuffle[V2Index] = 0;
12852 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12853 } else {
12854 V2 = DAG.getBitcast(MVT::v16i8, V2);
12855 V2 = DAG.getNode(
12856 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12857 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12858 V2 = DAG.getBitcast(VT, V2);
12859 }
12860 }
12861 return V2;
12862}
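// A brief worked illustration of the routine above: with every V1 lane
// zeroable, MVT::v4f32 and Mask = {4, -1, -1, -1}, the scalar behind V2 is
// lowered as VZEXT_MOVL(SCALAR_TO_VECTOR(s)), i.e. a zeroing move into lane 0.
// When the element must land in a non-zero lane of a wider integer vector, the
// same zeroing move is followed by a VSHLDQ byte shift of V2Index * EltBits / 8
// bytes, which is safe only because every other lane is known to be zero.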
12863
12864/// Try to lower broadcast of a single - truncated - integer element,
12865/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12866///
12867/// This assumes we have AVX2.
12868static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12869 int BroadcastIdx,
12870 const X86Subtarget &Subtarget,
12871 SelectionDAG &DAG) {
12872 assert(Subtarget.hasAVX2() &&
12873 "We can only lower integer broadcasts with AVX2!");
12874
12875 MVT EltVT = VT.getVectorElementType();
12876 MVT V0VT = V0.getSimpleValueType();
12877
12878 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12879 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12880
12881 MVT V0EltVT = V0VT.getVectorElementType();
12882 if (!V0EltVT.isInteger())
12883 return SDValue();
12884
12885 const unsigned EltSize = EltVT.getSizeInBits();
12886 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12887
12888 // This is only a truncation if the original element type is larger.
12889 if (V0EltSize <= EltSize)
12890 return SDValue();
12891
12892 assert(((V0EltSize % EltSize) == 0) &&
12893 "Scalar type sizes must all be powers of 2 on x86!");
12894
12895 const unsigned V0Opc = V0.getOpcode();
12896 const unsigned Scale = V0EltSize / EltSize;
12897 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12898
12899 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12900 V0Opc != ISD::BUILD_VECTOR)
12901 return SDValue();
12902
12903 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12904
12905 // If we're extracting non-least-significant bits, shift so we can truncate.
12906 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12907 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12908 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12909 if (const int OffsetIdx = BroadcastIdx % Scale)
12910 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12911 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12912
12913 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12914 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12915}
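// A worked example of the index arithmetic above: broadcasting byte 5 of a
// v4i32 BUILD_VECTOR into a v16i8 result gives Scale = 32 / 8 = 4, so the
// scalar comes from operand 5 / 4 = 1; the remaining offset 5 % 4 = 1 is
// handled by an SRL of 1 * 8 = 8 bits before the TRUNCATE + VBROADCAST.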
12916
12917/// Test whether this can be lowered with a single SHUFPS instruction.
12918///
12919/// This is used to disable more specialized lowerings when the shufps lowering
12920/// will happen to be efficient.
12921static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12922 // This routine only handles 128-bit shufps.
12923 assert(Mask.size() == 4 && "Unsupported mask size!");
12924 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12925 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12926 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12927 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12928
12929 // To lower with a single SHUFPS we need to have the low half and high half
12930 // each requiring a single input.
12931 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12932 return false;
12933 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12934 return false;
12935
12936 return true;
12937}
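// To make the test above concrete: Mask = {0, 1, 4, 5} keeps each half on a
// single input (low half from V1, high half from V2) and is accepted, while
// Mask = {0, 4, 1, 5} mixes both inputs inside the low half and is rejected.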
12938
12939/// Test whether the specified input (0 or 1) is in-place blended by the
12940/// given mask.
12941///
12942/// This returns true if the elements from a particular input are already in the
12943/// slot required by the given mask and require no permutation.
12944static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12945 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12946 int Size = Mask.size();
12947 for (int i = 0; i < Size; ++i)
12948 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12949 return false;
12950
12951 return true;
12952}
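// For instance, with Input = 1 and Mask = {0, 5, 2, 7} the V2 elements 1 and 3
// already sit in result slots 1 and 3, so this returns true; Mask = {0, 7, 2, 5}
// would need a permutation of V2 and returns false.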
12953
12954/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12955/// the given mask.
12956///
12957static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12958 int BroadcastableElement = 0) {
12959 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12960 int Size = Mask.size();
12961 for (int i = 0; i < Size; ++i)
12962 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12963 Mask[i] % Size != BroadcastableElement)
12964 return false;
12965 return true;
12966}
12967
12968/// If we are extracting two 128-bit halves of a vector and shuffling the
12969/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12970/// multi-shuffle lowering.
12971static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12972 SDValue N1, ArrayRef<int> Mask,
12973 SelectionDAG &DAG) {
12974 MVT VT = N0.getSimpleValueType();
12975 assert((VT.is128BitVector() &&
12976 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12977 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12978
12979 // Check that both sources are extracts of the same source vector.
12980 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12981 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12982 N0.getOperand(0) != N1.getOperand(0) ||
12983 !N0.hasOneUse() || !N1.hasOneUse())
12984 return SDValue();
12985
12986 SDValue WideVec = N0.getOperand(0);
12987 MVT WideVT = WideVec.getSimpleValueType();
12988 if (!WideVT.is256BitVector())
12989 return SDValue();
12990
12991 // Match extracts of each half of the wide source vector. Commute the shuffle
12992 // if the extract of the low half is N1.
12993 unsigned NumElts = VT.getVectorNumElements();
12994 SmallVector<int, 4> NewMask(Mask);
12995 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12996 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12997 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12998 ShuffleVectorSDNode::commuteMask(NewMask);
12999 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13000 return SDValue();
13001
13002 // Final bailout: if the mask is simple, we are better off using an extract
13003 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13004 // because that avoids a constant load from memory.
13005 if (NumElts == 4 &&
13006 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13007 return SDValue();
13008
13009 // Extend the shuffle mask with undef elements.
13010 NewMask.append(NumElts, -1);
13011
13012 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13013 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13014 NewMask);
13015 // This is free: ymm -> xmm.
13016 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13017 DAG.getVectorIdxConstant(0, DL));
13018}
13019
13020/// Try to lower broadcast of a single element.
13021///
13022/// For convenience, this code also bundles all of the subtarget feature set
13023/// filtering. While a little annoying to re-dispatch on type here, there isn't
13024/// a convenient way to factor it out.
13025static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13026 SDValue V2, ArrayRef<int> Mask,
13027 const X86Subtarget &Subtarget,
13028 SelectionDAG &DAG) {
13029 MVT EltVT = VT.getVectorElementType();
13030 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13031 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13032 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13033 return SDValue();
13034
13035 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13036 // we can only broadcast from a register with AVX2.
13037 unsigned NumEltBits = VT.getScalarSizeInBits();
13038 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13039 ? X86ISD::MOVDDUP
13040 : X86ISD::VBROADCAST;
13041 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13042
13043 // Check that the mask is a broadcast.
13044 int BroadcastIdx = getSplatIndex(Mask);
13045 if (BroadcastIdx < 0) {
13046 // Check for hidden broadcast.
13047 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13048 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13049 return SDValue();
13050 BroadcastIdx = 0;
13051 }
13052 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13053 "a sorted mask where the broadcast "
13054 "comes from V1.");
13055 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13056
13057 // Go up the chain of (vector) values to find a scalar load that we can
13058 // combine with the broadcast.
13059 // TODO: Combine this logic with findEltLoadSrc() used by
13060 // EltsFromConsecutiveLoads().
13061 int BitOffset = BroadcastIdx * NumEltBits;
13062 SDValue V = V1;
13063 for (;;) {
13064 switch (V.getOpcode()) {
13065 case ISD::BITCAST: {
13066 V = V.getOperand(0);
13067 continue;
13068 }
13069 case ISD::CONCAT_VECTORS: {
13070 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13071 int OpIdx = BitOffset / OpBitWidth;
13072 V = V.getOperand(OpIdx);
13073 BitOffset %= OpBitWidth;
13074 continue;
13075 }
13076 case ISD::EXTRACT_SUBVECTOR: {
13077 // The extraction index adds to the existing offset.
13078 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13079 unsigned Idx = V.getConstantOperandVal(1);
13080 unsigned BeginOffset = Idx * EltBitWidth;
13081 BitOffset += BeginOffset;
13082 V = V.getOperand(0);
13083 continue;
13084 }
13085 case ISD::INSERT_SUBVECTOR: {
13086 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13087 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13088 int Idx = (int)V.getConstantOperandVal(2);
13089 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13090 int BeginOffset = Idx * EltBitWidth;
13091 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13092 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13093 BitOffset -= BeginOffset;
13094 V = VInner;
13095 } else {
13096 V = VOuter;
13097 }
13098 continue;
13099 }
13100 }
13101 break;
13102 }
13103 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13104 BroadcastIdx = BitOffset / NumEltBits;
13105
13106 // Do we need to bitcast the source to retrieve the original broadcast index?
13107 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13108
13109 // Check if this is a broadcast of a scalar. We special case lowering
13110 // for scalars so that we can more effectively fold with loads.
13111 // If the original value has a larger element type than the shuffle, the
13112 // broadcast element is in essence truncated. Make that explicit to ease
13113 // folding.
13114 if (BitCastSrc && VT.isInteger())
13115 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13116 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13117 return TruncBroadcast;
13118
13119 // Also check the simpler case, where we can directly reuse the scalar.
13120 if (!BitCastSrc &&
13121 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13122 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13123 V = V.getOperand(BroadcastIdx);
13124
13125 // If we can't broadcast from a register, check that the input is a load.
13126 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13127 return SDValue();
13128 } else if (ISD::isNormalLoad(V.getNode()) &&
13129 cast<LoadSDNode>(V)->isSimple()) {
13130 // We do not check for one-use of the vector load because a broadcast load
13131 // is expected to be a win for code size, register pressure, and possibly
13132 // uops even if the original vector load is not eliminated.
13133
13134 // Reduce the vector load and shuffle to a broadcasted scalar load.
13135 auto *Ld = cast<LoadSDNode>(V);
13136 SDValue BaseAddr = Ld->getBasePtr();
13137 MVT SVT = VT.getScalarType();
13138 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13139 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13140 SDValue NewAddr =
13141 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13142
13143 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13144 // than MOVDDUP.
13145 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13146 if (Opcode == X86ISD::VBROADCAST) {
13147 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13148 SDValue Ops[] = {Ld->getChain(), NewAddr};
13149 V = DAG.getMemIntrinsicNode(
13150 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13151 DAG.getMachineFunction().getMachineMemOperand(
13152 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13153 DAG.makeEquivalentMemoryOrdering(Ld, V);
13154 return DAG.getBitcast(VT, V);
13155 }
13156 assert(SVT == MVT::f64 && "Unexpected VT!");
13157 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13158 DAG.getMachineFunction().getMachineMemOperand(
13159 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13160 DAG.makeEquivalentMemoryOrdering(Ld, V);
13161 } else if (!BroadcastFromReg) {
13162 // We can't broadcast from a vector register.
13163 return SDValue();
13164 } else if (BitOffset != 0) {
13165 // We can only broadcast from the zero-element of a vector register,
13166 // but it can be advantageous to broadcast from the zero-element of a
13167 // subvector.
13168 if (!VT.is256BitVector() && !VT.is512BitVector())
13169 return SDValue();
13170
13171 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13172 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13173 return SDValue();
13174
13175 // If we are broadcasting an element from the lowest 128-bit subvector, try
13176 // to move the element in position.
13177 if (BitOffset < 128 && NumActiveElts > 1 &&
13178 V.getScalarValueSizeInBits() == NumEltBits) {
13179 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13180 "Unexpected bit-offset");
13181 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13182 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13183 V = extractSubVector(V, 0, DAG, DL, 128);
13184 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13185 } else {
13186 // Only broadcast the zero-element of a 128-bit subvector.
13187 if ((BitOffset % 128) != 0)
13188 return SDValue();
13189
13190 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13191 "Unexpected bit-offset");
13192 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13193 "Unexpected vector size");
13194 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13195 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13196 }
13197 }
13198
13199 // On AVX we can use VBROADCAST directly for scalar sources.
13200 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13201 V = DAG.getBitcast(MVT::f64, V);
13202 if (Subtarget.hasAVX()) {
13203 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13204 return DAG.getBitcast(VT, V);
13205 }
13206 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13207 }
13208
13209 // If this is a scalar, do the broadcast on this type and bitcast.
13210 if (!V.getValueType().isVector()) {
13211 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13212 "Unexpected scalar size");
13213 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13214 VT.getVectorNumElements());
13215 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13216 }
13217
13218 // We only support broadcasting from 128-bit vectors to minimize the
13219 // number of patterns we need to deal with in isel. So extract down to
13220 // 128-bits, removing as many bitcasts as possible.
13221 if (V.getValueSizeInBits() > 128)
13222 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13223
13224 // Otherwise cast V to a vector with the same element type as VT, but
13225 // possibly narrower than VT. Then perform the broadcast.
13226 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13227 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13228 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13229}
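// A short example of the bit-offset walk above: broadcasting element 3 of
// V1 = CONCAT_VECTORS(A, B) with v2i64 halves starts at BitOffset = 192,
// steps into operand 192 / 128 = 1 (B) with BitOffset = 64, and ends up
// broadcasting element 64 / 64 = 1 of B.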
13230
13231// Check for whether we can use INSERTPS to perform the shuffle. We only use
13232// INSERTPS when the V1 elements are already in the correct locations
13233// because otherwise we can just always use two SHUFPS instructions which
13234// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13235// perform INSERTPS if a single V1 element is out of place and all V2
13236// elements are zeroable.
13237static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13238 unsigned &InsertPSMask,
13239 const APInt &Zeroable,
13240 ArrayRef<int> Mask, SelectionDAG &DAG) {
13241 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13242 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13243 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13244
13245 // Attempt to match INSERTPS with one element from VA or VB being
13246 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13247 // are updated.
13248 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13249 ArrayRef<int> CandidateMask) {
13250 unsigned ZMask = 0;
13251 int VADstIndex = -1;
13252 int VBDstIndex = -1;
13253 bool VAUsedInPlace = false;
13254
13255 for (int i = 0; i < 4; ++i) {
13256 // Synthesize a zero mask from the zeroable elements (includes undefs).
13257 if (Zeroable[i]) {
13258 ZMask |= 1 << i;
13259 continue;
13260 }
13261
13262 // Flag if we use any VA inputs in place.
13263 if (i == CandidateMask[i]) {
13264 VAUsedInPlace = true;
13265 continue;
13266 }
13267
13268 // We can only insert a single non-zeroable element.
13269 if (VADstIndex >= 0 || VBDstIndex >= 0)
13270 return false;
13271
13272 if (CandidateMask[i] < 4) {
13273 // VA input out of place for insertion.
13274 VADstIndex = i;
13275 } else {
13276 // VB input for insertion.
13277 VBDstIndex = i;
13278 }
13279 }
13280
13281 // Don't bother if we have no (non-zeroable) element for insertion.
13282 if (VADstIndex < 0 && VBDstIndex < 0)
13283 return false;
13284
13285 // Determine element insertion src/dst indices. The src index is from the
13286 // start of the inserted vector, not the start of the concatenated vector.
13287 unsigned VBSrcIndex = 0;
13288 if (VADstIndex >= 0) {
13289 // If we have a VA input out of place, we use VA as the V2 element
13290 // insertion and don't use the original V2 at all.
13291 VBSrcIndex = CandidateMask[VADstIndex];
13292 VBDstIndex = VADstIndex;
13293 VB = VA;
13294 } else {
13295 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13296 }
13297
13298 // If no V1 inputs are used in place, then the result is created only from
13299 // the zero mask and the V2 insertion - so remove V1 dependency.
13300 if (!VAUsedInPlace)
13301 VA = DAG.getUNDEF(MVT::v4f32);
13302
13303 // Update V1, V2 and InsertPSMask accordingly.
13304 V1 = VA;
13305 V2 = VB;
13306
13307 // Insert the V2 element into the desired position.
13308 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13309 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13310 return true;
13311 };
13312
13313 if (matchAsInsertPS(V1, V2, Mask))
13314 return true;
13315
13316 // Commute and try again.
13317 SmallVector<int, 4> CommutedMask(Mask);
13318 ShuffleVectorSDNode::commuteMask(CommutedMask);
13319 if (matchAsInsertPS(V2, V1, CommutedMask))
13320 return true;
13321
13322 return false;
13323}
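// A quick sanity check of the immediate layout above: inserting element 2 of
// V2 into lane 1 of V1 while zeroing lane 3 encodes as
// InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98.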
13324
13325static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13326 ArrayRef<int> Mask, const APInt &Zeroable,
13327 SelectionDAG &DAG) {
13328 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13329 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13330
13331 // Attempt to match the insertps pattern.
13332 unsigned InsertPSMask = 0;
13333 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13334 return SDValue();
13335
13336 // Insert the V2 element into the desired position.
13337 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13338 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13339}
13340
13341/// Handle lowering of 2-lane 64-bit floating point shuffles.
13342///
13343/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13344/// support for floating point shuffles but not integer shuffles. These
13345/// instructions will incur a domain crossing penalty on some chips though so
13346/// it is better to avoid lowering through this for integer vectors where
13347/// possible.
13348static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13349 const APInt &Zeroable, SDValue V1, SDValue V2,
13350 const X86Subtarget &Subtarget,
13351 SelectionDAG &DAG) {
13352 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13353 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13354 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13355
13356 if (V2.isUndef()) {
13357 // Check for being able to broadcast a single element.
13358 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13359 Mask, Subtarget, DAG))
13360 return Broadcast;
13361
13362 // Straight shuffle of a single input vector. Simulate this by using the
13363 // single input as both of the "inputs" to this instruction.
13364 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13365
13366 if (Subtarget.hasAVX()) {
13367 // If we have AVX, we can use VPERMILPS which will allow folding a load
13368 // into the shuffle.
13369 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13370 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13371 }
13372
13373 return DAG.getNode(
13374 X86ISD::SHUFP, DL, MVT::v2f64,
13375 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13376 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13377 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13378 }
13379 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13380 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13381 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13382 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13383
13384 if (Subtarget.hasAVX2())
13385 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13386 return Extract;
13387
13388 // When loading a scalar and then shuffling it into a vector we can often do
13389 // the insertion cheaply.
13390 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13391 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13392 return Insertion;
13393 // Try inverting the insertion since for v2 masks it is easy to do and we
13394 // can't reliably sort the mask one way or the other.
13395 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13396 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13397 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13398 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13399 return Insertion;
13400
13401 // Try to use one of the special instruction patterns to handle two common
13402 // blend patterns if a zero-blend above didn't work.
13403 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13404 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13405 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13406 // We can either use a special instruction to load over the low double or
13407 // to move just the low double.
13408 return DAG.getNode(
13409 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13410 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13411
13412 if (Subtarget.hasSSE41())
13413 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13414 Zeroable, Subtarget, DAG))
13415 return Blend;
13416
13417 // Use dedicated unpack instructions for masks that match their pattern.
13418 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13419 return V;
13420
13421 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13422 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13423 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13424}
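// As an example of the SHUFPD immediates built above: the unary Mask = {1, 1}
// yields SHUFPDMask = 1 | (1 << 1) = 3 (splat the high element), and the
// two-input Mask = {1, 3} also yields 3, selecting V1[1] and V2[1].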
13425
13426/// Handle lowering of 2-lane 64-bit integer shuffles.
13427///
13428/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13429/// the integer unit to minimize domain crossing penalties. However, for blends
13430/// it falls back to the floating point shuffle operation with appropriate bit
13431/// casting.
13432static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13433 const APInt &Zeroable, SDValue V1, SDValue V2,
13434 const X86Subtarget &Subtarget,
13435 SelectionDAG &DAG) {
13436 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13437 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13438 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13439
13440 if (V2.isUndef()) {
13441 // Check for being able to broadcast a single element.
13442 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13443 Mask, Subtarget, DAG))
13444 return Broadcast;
13445
13446 // Straight shuffle of a single input vector. For everything from SSE2
13447 // onward this has a single fast instruction with no scary immediates.
13448 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13449 V1 = DAG.getBitcast(MVT::v4i32, V1);
13450 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13451 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13452 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13453 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13454 return DAG.getBitcast(
13455 MVT::v2i64,
13456 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13457 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13458 }
13459 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13460 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13461 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13462 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13463
13464 if (Subtarget.hasAVX2())
13465 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13466 return Extract;
13467
13468 // Try to use shift instructions.
13469 if (SDValue Shift =
13470 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13471 DAG, /*BitwiseOnly*/ false))
13472 return Shift;
13473
13474 // When loading a scalar and then shuffling it into a vector we can often do
13475 // the insertion cheaply.
13476 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13477 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13478 return Insertion;
13479 // Try inverting the insertion since for v2 masks it is easy to do and we
13480 // can't reliably sort the mask one way or the other.
13481 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13482 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13483 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13484 return Insertion;
13485
13486 // We have different paths for blend lowering, but they all must use the
13487 // *exact* same predicate.
13488 bool IsBlendSupported = Subtarget.hasSSE41();
13489 if (IsBlendSupported)
13490 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13491 Zeroable, Subtarget, DAG))
13492 return Blend;
13493
13494 // Use dedicated unpack instructions for masks that match their pattern.
13495 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13496 return V;
13497
13498 // Try to use byte rotation instructions.
13499 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13500 if (Subtarget.hasSSSE3()) {
13501 if (Subtarget.hasVLX())
13502 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13503 Zeroable, Subtarget, DAG))
13504 return Rotate;
13505
13506 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13507 Subtarget, DAG))
13508 return Rotate;
13509 }
13510
13511 // If we have direct support for blends, we should lower by decomposing into
13512 // a permute. That will be faster than the domain cross.
13513 if (IsBlendSupported)
13514 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13515 Zeroable, Subtarget, DAG);
13516
13517 // We implement this with SHUFPD which is pretty lame because it will likely
13518 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13519 // However, all the alternatives are still more cycles and newer chips don't
13520 // have this problem. It would be really nice if x86 had better shuffles here.
13521 V1 = DAG.getBitcast(MVT::v2f64, V1);
13522 V2 = DAG.getBitcast(MVT::v2f64, V2);
13523 return DAG.getBitcast(MVT::v2i64,
13524 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13525}
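// For the unary path above, a v2i64 Mask = {1, 0} widens to the v4i32 mask
// {2, 3, 0, 1}, i.e. the classic qword-swapping PSHUFD with immediate 0x4E.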
13526
13527/// Lower a vector shuffle using the SHUFPS instruction.
13528///
13529/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13530/// It makes no assumptions about whether this is the *best* lowering, it simply
13531/// uses it.
13532static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13533 ArrayRef<int> Mask, SDValue V1,
13534 SDValue V2, SelectionDAG &DAG) {
13535 SDValue LowV = V1, HighV = V2;
13536 SmallVector<int, 4> NewMask(Mask);
13537 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13538
13539 if (NumV2Elements == 1) {
13540 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13541
13542 // Compute the index adjacent to V2Index and in the same half by toggling
13543 // the low bit.
13544 int V2AdjIndex = V2Index ^ 1;
13545
13546 if (Mask[V2AdjIndex] < 0) {
13547 // Handles all the cases where we have a single V2 element and an undef.
13548 // This will only ever happen in the high lanes because we commute the
13549 // vector otherwise.
13550 if (V2Index < 2)
13551 std::swap(LowV, HighV);
13552 NewMask[V2Index] -= 4;
13553 } else {
13554 // Handle the case where the V2 element ends up adjacent to a V1 element.
13555 // To make this work, blend them together as the first step.
13556 int V1Index = V2AdjIndex;
13557 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13558 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13559 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13560
13561 // Now proceed to reconstruct the final blend as we have the necessary
13562 // high or low half formed.
13563 if (V2Index < 2) {
13564 LowV = V2;
13565 HighV = V1;
13566 } else {
13567 HighV = V2;
13568 }
13569 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13570 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13571 }
13572 } else if (NumV2Elements == 2) {
13573 if (Mask[0] < 4 && Mask[1] < 4) {
13574 // Handle the easy case where we have V1 in the low lanes and V2 in the
13575 // high lanes.
13576 NewMask[2] -= 4;
13577 NewMask[3] -= 4;
13578 } else if (Mask[2] < 4 && Mask[3] < 4) {
13579 // We also handle the reversed case because this utility may get called
13580 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13581 // arrange things in the right direction.
13582 NewMask[0] -= 4;
13583 NewMask[1] -= 4;
13584 HighV = V1;
13585 LowV = V2;
13586 } else {
13587 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13588 // trying to place elements directly, just blend them and set up the final
13589 // shuffle to place them.
13590
13591 // The first two blend mask elements are for V1, the second two are for
13592 // V2.
13593 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13594 Mask[2] < 4 ? Mask[2] : Mask[3],
13595 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13596 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13597 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13598 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13599
13600 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13601 // a blend.
13602 LowV = HighV = V1;
13603 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13604 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13605 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13606 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13607 }
13608 } else if (NumV2Elements == 3) {
13609 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13610 // we can get here due to other paths (e.g. repeated mask matching) where we
13611 // don't want to do another round of lowerVECTOR_SHUFFLE.
13612 ShuffleVectorSDNode::commuteMask(NewMask);
13613 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13614 }
13615 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13616 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13617}
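// A worked example of the easy NumV2Elements == 2 case above: Mask =
// {0, 1, 6, 7} becomes NewMask = {0, 1, 2, 3}, so a single SHUFP takes its low
// half from V1[0..1] and its high half from V2[2..3].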
13618
13619/// Lower 4-lane 32-bit floating point shuffles.
13620///
13621/// Uses instructions exclusively from the floating point unit to minimize
13622/// domain crossing penalties, as these are sufficient to implement all v4f32
13623/// shuffles.
13624static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13625 const APInt &Zeroable, SDValue V1, SDValue V2,
13626 const X86Subtarget &Subtarget,
13627 SelectionDAG &DAG) {
13628 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13629 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13630 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13631
13632 if (Subtarget.hasSSE41())
13633 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13634 Zeroable, Subtarget, DAG))
13635 return Blend;
13636
13637 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13638
13639 if (NumV2Elements == 0) {
13640 // Check for being able to broadcast a single element.
13641 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13642 Mask, Subtarget, DAG))
13643 return Broadcast;
13644
13645 // Use even/odd duplicate instructions for masks that match their pattern.
13646 if (Subtarget.hasSSE3()) {
13647 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13648 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13649 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13650 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13651 }
13652
13653 if (Subtarget.hasAVX()) {
13654 // If we have AVX, we can use VPERMILPS which will allow folding a load
13655 // into the shuffle.
13656 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13657 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13658 }
13659
13660 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13661 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13662 if (!Subtarget.hasSSE2()) {
13663 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13664 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13665 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13666 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13667 }
13668
13669 // Otherwise, use a straight shuffle of a single input vector. We pass the
13670 // input vector to both operands to simulate this with a SHUFPS.
13671 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13672 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13673 }
13674
13675 if (Subtarget.hasSSE2())
13676 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13677 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13678 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13679 return ZExt;
13680 }
13681
13682 if (Subtarget.hasAVX2())
13683 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13684 return Extract;
13685
13686 // There are special ways we can lower some single-element blends. However, we
13687 // have custom ways we can lower more complex single-element blends below that
13688 // we defer to if both this and BLENDPS fail to match, so restrict this to
13689 // when the V2 input is targeting element 0 of the mask -- that is the fast
13690 // case here.
13691 if (NumV2Elements == 1 && Mask[0] >= 4)
13692 if (SDValue V = lowerShuffleAsElementInsertion(
13693 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13694 return V;
13695
13696 if (Subtarget.hasSSE41()) {
13697 // Use INSERTPS if we can complete the shuffle efficiently.
13698 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13699 return V;
13700
13701 if (!isSingleSHUFPSMask(Mask))
13702 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13703 V2, Mask, DAG))
13704 return BlendPerm;
13705 }
13706
13707 // Use low/high mov instructions. These are only valid in SSE1 because
13708 // otherwise they are widened to v2f64 and never get here.
13709 if (!Subtarget.hasSSE2()) {
13710 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13711 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13712 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13713 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13714 }
13715
13716 // Use dedicated unpack instructions for masks that match their pattern.
13717 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13718 return V;
13719
13720 // Otherwise fall back to a SHUFPS lowering strategy.
13721 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13722}
13723
13724/// Lower 4-lane i32 vector shuffles.
13725///
13726/// We try to handle these with integer-domain shuffles where we can, but for
13727/// blends we use the floating point domain blend instructions.
13728static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13729 const APInt &Zeroable, SDValue V1, SDValue V2,
13730 const X86Subtarget &Subtarget,
13731 SelectionDAG &DAG) {
13732 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13733 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13734 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13735
13736 // Whenever we can lower this as a zext, that instruction is strictly faster
13737 // than any alternative. It also allows us to fold memory operands into the
13738 // shuffle in many cases.
13739 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13740 Zeroable, Subtarget, DAG))
13741 return ZExt;
13742
13743 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13744
13745 // Try to use shift instructions if fast.
13746 if (Subtarget.preferLowerShuffleAsShift()) {
13747 if (SDValue Shift =
13748 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13749 Subtarget, DAG, /*BitwiseOnly*/ true))
13750 return Shift;
13751 if (NumV2Elements == 0)
13752 if (SDValue Rotate =
13753 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13754 return Rotate;
13755 }
13756
13757 if (NumV2Elements == 0) {
13758 // Try to use broadcast unless the mask only has one non-undef element.
13759 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13760 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13761 Mask, Subtarget, DAG))
13762 return Broadcast;
13763 }
13764
13765 // Straight shuffle of a single input vector. For everything from SSE2
13766 // onward this has a single fast instruction with no scary immediates.
13767 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13768 // but we aren't actually going to use the UNPCK instruction because doing
13769 // so prevents folding a load into this instruction or making a copy.
13770 const int UnpackLoMask[] = {0, 0, 1, 1};
13771 const int UnpackHiMask[] = {2, 2, 3, 3};
13772 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13773 Mask = UnpackLoMask;
13774 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13775 Mask = UnpackHiMask;
13776
13777 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13778 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13779 }
13780
13781 if (Subtarget.hasAVX2())
13782 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13783 return Extract;
13784
13785 // Try to use shift instructions.
13786 if (SDValue Shift =
13787 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13788 DAG, /*BitwiseOnly*/ false))
13789 return Shift;
13790
13791 // There are special ways we can lower some single-element blends.
13792 if (NumV2Elements == 1)
13793 if (SDValue V = lowerShuffleAsElementInsertion(
13794 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13795 return V;
13796
13797 // We have different paths for blend lowering, but they all must use the
13798 // *exact* same predicate.
13799 bool IsBlendSupported = Subtarget.hasSSE41();
13800 if (IsBlendSupported)
13801 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13802 Zeroable, Subtarget, DAG))
13803 return Blend;
13804
13805 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13806 Zeroable, Subtarget, DAG))
13807 return Masked;
13808
13809 // Use dedicated unpack instructions for masks that match their pattern.
13810 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13811 return V;
13812
13813 // Try to use byte rotation instructions.
13814 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13815 if (Subtarget.hasSSSE3()) {
13816 if (Subtarget.hasVLX())
13817 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13818 Zeroable, Subtarget, DAG))
13819 return Rotate;
13820
13821 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13822 Subtarget, DAG))
13823 return Rotate;
13824 }
13825
13826 // Assume that a single SHUFPS is faster than an alternative sequence of
13827 // multiple instructions (even if the CPU has a domain penalty).
13828 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13829 if (!isSingleSHUFPSMask(Mask)) {
13830 // If we have direct support for blends, we should lower by decomposing into
13831 // a permute. That will be faster than the domain cross.
13832 if (IsBlendSupported)
13833 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13834 Zeroable, Subtarget, DAG);
13835
13836 // Try to lower by permuting the inputs into an unpack instruction.
13837 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13838 Mask, Subtarget, DAG))
13839 return Unpack;
13840 }
13841
13842 // We implement this with SHUFPS because it can blend from two vectors.
13843 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13844 // up the inputs, bypassing domain shift penalties that we would incur if we
13845 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13846 // relevant.
13847 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13848 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13849 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13850 return DAG.getBitcast(MVT::v4i32, ShufPS);
13851}
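// To illustrate the single-input path above: Mask = {0, 0, 1, 1} is kept as a
// PSHUFD rather than an UNPCKLDQ, using immediate 0 | (0 << 2) | (1 << 4) |
// (1 << 6) = 0x50, which preserves the ability to fold a load of V1.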
13852
13853/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13854/// shuffle lowering, and the most complex part.
13855///
13856/// The lowering strategy is to try to form pairs of input lanes which are
13857/// targeted at the same half of the final vector, and then use a dword shuffle
13858/// to place them onto the right half, and finally unpack the paired lanes into
13859/// their final position.
13860///
13861/// The exact breakdown of how to form these dword pairs and align them on the
13862/// correct sides is really tricky. See the comments within the function for
13863/// more of the details.
13864///
13865/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13866/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13867/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13868/// vector, form the analogous 128-bit 8-element Mask.
13869static SDValue lowerV8I16GeneralSingleInputShuffle(
13870 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13871 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13872 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13873 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13874
13875 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13876 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13877 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13878
13879 // Attempt to directly match PSHUFLW or PSHUFHW.
13880 if (isUndefOrInRange(LoMask, 0, 4) &&
13881 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13882 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13883 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13884 }
13885 if (isUndefOrInRange(HiMask, 4, 8) &&
13886 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13887 for (int i = 0; i != 4; ++i)
13888 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13889 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13890 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13891 }
13892
13893 SmallVector<int, 4> LoInputs;
13894 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13895 array_pod_sort(LoInputs.begin(), LoInputs.end());
13896 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13897 SmallVector<int, 4> HiInputs;
13898 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13899 array_pod_sort(HiInputs.begin(), HiInputs.end());
13900 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13901 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13902 int NumHToL = LoInputs.size() - NumLToL;
13903 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13904 int NumHToH = HiInputs.size() - NumLToH;
13905 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13906 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13907 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13908 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13909
13910 // If we are shuffling values from one half - check how many different DWORD
13911 // pairs we need to create. If only 1 or 2 then we can perform this as a
13912 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13913 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13914 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13915 V = DAG.getNode(ShufWOp, DL, VT, V,
13916 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13917 V = DAG.getBitcast(PSHUFDVT, V);
13918 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13919 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13920 return DAG.getBitcast(VT, V);
13921 };
13922
13923 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13924 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13925 SmallVector<std::pair<int, int>, 4> DWordPairs;
13926 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13927
13928 // Collect the different DWORD pairs.
13929 for (int DWord = 0; DWord != 4; ++DWord) {
13930 int M0 = Mask[2 * DWord + 0];
13931 int M1 = Mask[2 * DWord + 1];
13932 M0 = (M0 >= 0 ? M0 % 4 : M0);
13933 M1 = (M1 >= 0 ? M1 % 4 : M1);
13934 if (M0 < 0 && M1 < 0)
13935 continue;
13936
13937 bool Match = false;
13938 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13939 auto &DWordPair = DWordPairs[j];
13940 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13941 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13942 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13943 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13944 PSHUFDMask[DWord] = DOffset + j;
13945 Match = true;
13946 break;
13947 }
13948 }
13949 if (!Match) {
13950 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13951 DWordPairs.push_back(std::make_pair(M0, M1));
13952 }
13953 }
13954
13955 if (DWordPairs.size() <= 2) {
13956 DWordPairs.resize(2, std::make_pair(-1, -1));
13957 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13958 DWordPairs[1].first, DWordPairs[1].second};
13959 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13960 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13961 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13962 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13963 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13964 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13965 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13966 }
13967 if ((NumHToL + NumHToH) == 0)
13968 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13969 if ((NumLToL + NumLToH) == 0)
13970 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13971 }
13972 }
13973
13974 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13975 // such inputs we can swap two of the dwords across the half mark and end up
13976 // with <=2 inputs to each half in each half. Once there, we can fall through
13977 // to the generic code below. For example:
13978 //
13979 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13980 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13981 //
13982 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13983 // and an existing 2-into-2 on the other half. In this case we may have to
13984 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13985 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13986 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13987 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13988 // half than the one we target for fixing) will be fixed when we re-enter this
13989 // path. We will also combine away any sequence of PSHUFD instructions that
13990 // result into a single instruction. Here is an example of the tricky case:
13991 //
13992 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13993 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13994 //
13995 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13996 //
13997 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13998 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13999 //
14000 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14001 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14002 //
14003 // The result is fine to be handled by the generic logic.
14004 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14005 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14006 int AOffset, int BOffset) {
14007 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14008 "Must call this with A having 3 or 1 inputs from the A half.");
14009 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14010 "Must call this with B having 1 or 3 inputs from the B half.");
14011 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14012 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14013
14014 bool ThreeAInputs = AToAInputs.size() == 3;
14015
14016 // Compute the index of dword with only one word among the three inputs in
14017 // a half by taking the sum of the half with three inputs and subtracting
14018 // the sum of the actual three inputs. The difference is the remaining
14019 // slot.
14020 int ADWord = 0, BDWord = 0;
14021 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14022 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14023 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14024 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14025 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14026 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14027 int TripleNonInputIdx =
14028 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14029 TripleDWord = TripleNonInputIdx / 2;
14030
14031 // We use xor with one to compute the adjacent DWord to whichever one the
14032 // OneInput is in.
14033 OneInputDWord = (OneInput / 2) ^ 1;
14034
14035 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14036 // and BToA inputs. If there is also such a problem with the BToB and AToB
14037 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14038 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14039 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14040 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14041 // Compute how many inputs will be flipped by swapping these DWords. We
14042 // need
14043 // to balance this to ensure we don't form a 3-1 shuffle in the other
14044 // half.
14045 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14046 llvm::count(AToBInputs, 2 * ADWord + 1);
14047 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14048 llvm::count(BToBInputs, 2 * BDWord + 1);
14049 if ((NumFlippedAToBInputs == 1 &&
14050 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14051 (NumFlippedBToBInputs == 1 &&
14052 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14053 // We choose whether to fix the A half or B half based on whether that
14054 // half has zero flipped inputs. At zero, we may not be able to fix it
14055 // with that half. We also bias towards fixing the B half because that
14056 // will more commonly be the high half, and we have to bias one way.
14057 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14058 ArrayRef<int> Inputs) {
14059 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14060 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14061 // Determine whether the free index is in the flipped dword or the
14062 // unflipped dword based on where the pinned index is. We use this bit
14063 // in an xor to conditionally select the adjacent dword.
14064 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14065 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14066 if (IsFixIdxInput == IsFixFreeIdxInput)
14067 FixFreeIdx += 1;
14068 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14069 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14070 "We need to be changing the number of flipped inputs!");
14071 int PSHUFHalfMask[] = {0, 1, 2, 3};
14072 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14073 V = DAG.getNode(
14074 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14075 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14076 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14077
14078 for (int &M : Mask)
14079 if (M >= 0 && M == FixIdx)
14080 M = FixFreeIdx;
14081 else if (M >= 0 && M == FixFreeIdx)
14082 M = FixIdx;
14083 };
14084 if (NumFlippedBToBInputs != 0) {
14085 int BPinnedIdx =
14086 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14087 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14088 } else {
14089 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14090 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14091 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14092 }
14093 }
14094 }
14095
14096 int PSHUFDMask[] = {0, 1, 2, 3};
14097 PSHUFDMask[ADWord] = BDWord;
14098 PSHUFDMask[BDWord] = ADWord;
14099 V = DAG.getBitcast(
14100 VT,
14101 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14102 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14103
14104 // Adjust the mask to match the new locations of A and B.
14105 for (int &M : Mask)
14106 if (M >= 0 && M/2 == ADWord)
14107 M = 2 * BDWord + M % 2;
14108 else if (M >= 0 && M/2 == BDWord)
14109 M = 2 * ADWord + M % 2;
14110
14111 // Recurse back into this routine to re-compute state now that this isn't
14112 // a 3 and 1 problem.
14113 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14114 };
14115 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14116 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14117 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14118 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14119
14120 // At this point there are at most two inputs to the low and high halves from
14121 // each half. That means the inputs can always be grouped into dwords and
14122 // those dwords can then be moved to the correct half with a dword shuffle.
14123 // We use at most one low and one high word shuffle to collect these paired
14124 // inputs into dwords, and finally a dword shuffle to place them.
14125 int PSHUFLMask[4] = {-1, -1, -1, -1};
14126 int PSHUFHMask[4] = {-1, -1, -1, -1};
14127 int PSHUFDMask[4] = {-1, -1, -1, -1};
14128
14129 // First fix the masks for all the inputs that are staying in their
14130 // original halves. This will then dictate the targets of the cross-half
14131 // shuffles.
14132 auto fixInPlaceInputs =
14133 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14134 MutableArrayRef<int> SourceHalfMask,
14135 MutableArrayRef<int> HalfMask, int HalfOffset) {
14136 if (InPlaceInputs.empty())
14137 return;
14138 if (InPlaceInputs.size() == 1) {
14139 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14140 InPlaceInputs[0] - HalfOffset;
14141 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14142 return;
14143 }
14144 if (IncomingInputs.empty()) {
14145 // Just fix all of the in place inputs.
14146 for (int Input : InPlaceInputs) {
14147 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14148 PSHUFDMask[Input / 2] = Input / 2;
14149 }
14150 return;
14151 }
14152
14153 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14154 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14155 InPlaceInputs[0] - HalfOffset;
14156 // Put the second input next to the first so that they are packed into
14157 // a dword. We find the adjacent index by toggling the low bit.
14158 int AdjIndex = InPlaceInputs[0] ^ 1;
14159 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14160 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14161 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14162 };
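// For example, with InPlaceInputs == {0, 2}, cross-half inputs still pending,
// and HalfOffset == 0: word 0 stays put, word 2 is pulled into the adjacent
// slot 1 so the pair forms dword 0, LoMask references to 2 become 1, and
// PSHUFDMask[0] is pinned to 0.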
14163 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14164 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14165
14166 // Now gather the cross-half inputs and place them into a free dword of
14167 // their target half.
14168 // FIXME: This operation could almost certainly be simplified dramatically to
14169 // look more like the 3-1 fixing operation.
14170 auto moveInputsToRightHalf = [&PSHUFDMask](
14171 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14172 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14173 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14174 int DestOffset) {
14175 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14176 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14177 };
14178 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14179 int Word) {
14180 int LowWord = Word & ~1;
14181 int HighWord = Word | 1;
14182 return isWordClobbered(SourceHalfMask, LowWord) ||
14183 isWordClobbered(SourceHalfMask, HighWord);
14184 };
14185
14186 if (IncomingInputs.empty())
14187 return;
14188
14189 if (ExistingInputs.empty()) {
14190 // Map any dwords with inputs from them into the right half.
14191 for (int Input : IncomingInputs) {
14192 // If the source half mask maps over the inputs, turn those into
14193 // swaps and use the swapped lane.
14194 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14195 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14196 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14197 Input - SourceOffset;
14198 // We have to swap the uses in our half mask in one sweep.
14199 for (int &M : HalfMask)
14200 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14201 M = Input;
14202 else if (M == Input)
14203 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14204 } else {
14205 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14206 Input - SourceOffset &&
14207 "Previous placement doesn't match!");
14208 }
14209 // Note that this correctly re-maps both when we do a swap and when
14210 // we observe the other side of the swap above. We rely on that to
14211 // avoid swapping the members of the input list directly.
14212 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14213 }
14214
14215 // Map the input's dword into the correct half.
14216 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14217 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14218 else
14219 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14220 Input / 2 &&
14221 "Previous placement doesn't match!");
14222 }
14223
14224 // And just directly shift any other-half mask elements to be same-half
14225 // as we will have mirrored the dword containing the element into the
14226 // same position within that half.
14227 for (int &M : HalfMask)
14228 if (M >= SourceOffset && M < SourceOffset + 4) {
14229 M = M - SourceOffset + DestOffset;
14230 assert(M >= 0 && "This should never wrap below zero!");
14231 }
14232 return;
14233 }
14234
14235 // Ensure we have the input in a viable dword of its current half. This
14236 // is particularly tricky because the original position may be clobbered
14237 // by inputs being moved and *staying* in that half.
14238 if (IncomingInputs.size() == 1) {
14239 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14240 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14241 SourceOffset;
14242 SourceHalfMask[InputFixed - SourceOffset] =
14243 IncomingInputs[0] - SourceOffset;
14244 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14245 IncomingInputs[0] = InputFixed;
14246 }
14247 } else if (IncomingInputs.size() == 2) {
14248 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14249 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14250 // We have two non-adjacent or clobbered inputs we need to extract from
14251 // the source half. To do this, we need to map them into some adjacent
14252 // dword slot in the source mask.
14253 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14254 IncomingInputs[1] - SourceOffset};
14255
14256 // If there is a free slot in the source half mask adjacent to one of
14257 // the inputs, place the other input in it. We use (Index XOR 1) to
14258 // compute an adjacent index.
14259 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14260 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14261 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14262 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14263 InputsFixed[1] = InputsFixed[0] ^ 1;
14264 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14265 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14266 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14267 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14268 InputsFixed[0] = InputsFixed[1] ^ 1;
14269 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14270 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14271 // The two inputs are in the same DWord but it is clobbered and the
14272 // adjacent DWord isn't used at all. Move both inputs to the free
14273 // slot.
14274 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14275 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14276 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14277 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14278 } else {
14279 // The only way we hit this point is if there is no clobbering
14280 // (because there are no off-half inputs to this half) and there is no
14281 // free slot adjacent to one of the inputs. In this case, we have to
14282 // swap an input with a non-input.
14283 for (int i = 0; i < 4; ++i)
14284 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14285 "We can't handle any clobbers here!");
14286 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14287 "Cannot have adjacent inputs here!");
14288
14289 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14290 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14291
14292 // We also have to update the final source mask in this case because
14293 // it may need to undo the above swap.
14294 for (int &M : FinalSourceHalfMask)
14295 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14296 M = InputsFixed[1] + SourceOffset;
14297 else if (M == InputsFixed[1] + SourceOffset)
14298 M = (InputsFixed[0] ^ 1) + SourceOffset;
14299
14300 InputsFixed[1] = InputsFixed[0] ^ 1;
14301 }
14302
14303 // Point everything at the fixed inputs.
14304 for (int &M : HalfMask)
14305 if (M == IncomingInputs[0])
14306 M = InputsFixed[0] + SourceOffset;
14307 else if (M == IncomingInputs[1])
14308 M = InputsFixed[1] + SourceOffset;
14309
14310 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14311 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14312 }
14313 } else {
14314 llvm_unreachable("Unhandled input size!");
14315 }
14316
14317 // Now hoist the DWord down to the right half.
14318 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14319 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14320 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14321 for (int &M : HalfMask)
14322 for (int Input : IncomingInputs)
14323 if (M == Input)
14324 M = FreeDWord * 2 + Input % 2;
14325 };
14326 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14327 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14328 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14329 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14330
14331 // Now enact all the shuffles we've computed to move the inputs into their
14332 // target half.
14333 if (!isNoopShuffleMask(PSHUFLMask))
14334 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14335 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14336 if (!isNoopShuffleMask(PSHUFHMask))
14337 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14338 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14339 if (!isNoopShuffleMask(PSHUFDMask))
14340 V = DAG.getBitcast(
14341 VT,
14342 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14343 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14344
14345 // At this point, each half should contain all its inputs, and we can then
14346 // just shuffle them into their final position.
14347 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14348 "Failed to lift all the high half inputs to the low mask!");
14349 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14350 "Failed to lift all the low half inputs to the high mask!");
14351
14352 // Do a half shuffle for the low mask.
14353 if (!isNoopShuffleMask(LoMask))
14354 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14355 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14356
14357 // Do a half shuffle with the high mask after shifting its values down.
14358 for (int &M : HiMask)
14359 if (M >= 0)
14360 M -= 4;
14361 if (!isNoopShuffleMask(HiMask))
14362 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14363 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14364
14365 return V;
14366}
14367
14368/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14369/// blend if only one input is used.
14370static SDValue lowerShuffleAsBlendOfPSHUFBs(
14371 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14372 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14373 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14374 "Lane crossing shuffle masks not supported");
14375
14376 int NumBytes = VT.getSizeInBits() / 8;
14377 int Size = Mask.size();
14378 int Scale = NumBytes / Size;
14379
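// For a v8i16 mask, for instance, NumBytes == 16 and Scale == 2, so mask
// element M expands to control bytes 2*M and 2*M+1; the 0x80 sentinel below
// makes PSHUFB clear the corresponding destination byte instead of copying one.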
14380 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14381 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14382 V1InUse = false;
14383 V2InUse = false;
14384
14385 for (int i = 0; i < NumBytes; ++i) {
14386 int M = Mask[i / Scale];
14387 if (M < 0)
14388 continue;
14389
14390 const int ZeroMask = 0x80;
14391 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14392 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14393 if (Zeroable[i / Scale])
14394 V1Idx = V2Idx = ZeroMask;
14395
14396 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14397 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14398 V1InUse |= (ZeroMask != V1Idx);
14399 V2InUse |= (ZeroMask != V2Idx);
14400 }
14401
14402 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14403 if (V1InUse)
14404 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14405 DAG.getBuildVector(ShufVT, DL, V1Mask));
14406 if (V2InUse)
14407 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14408 DAG.getBuildVector(ShufVT, DL, V2Mask));
14409
14410 // If we need shuffled inputs from both, blend the two.
14411 SDValue V;
14412 if (V1InUse && V2InUse)
14413 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14414 else
14415 V = V1InUse ? V1 : V2;
14416
14417 // Cast the result back to the correct type.
14418 return DAG.getBitcast(VT, V);
14419}
14420
14421/// Generic lowering of 8-lane i16 shuffles.
14422///
14423/// This handles both single-input shuffles and combined shuffle/blends with
14424/// two inputs. The single input shuffles are immediately delegated to
14425/// a dedicated lowering routine.
14426///
14427/// The blends are lowered in one of three fundamental ways. If there are few
14428/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14429/// of the input is significantly cheaper when lowered as an interleaving of
14430/// the two inputs, try to interleave them. Otherwise, blend the low and high
14431/// halves of the inputs separately (making them have relatively few inputs)
14432/// and then concatenate them.
14433static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14434 const APInt &Zeroable, SDValue V1, SDValue V2,
14435 const X86Subtarget &Subtarget,
14436 SelectionDAG &DAG) {
14437 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14438 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14439 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14440
14441 // Whenever we can lower this as a zext, that instruction is strictly faster
14442 // than any alternative.
14443 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14444 Zeroable, Subtarget, DAG))
14445 return ZExt;
14446
14447 // Try to lower using a truncation.
14448 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14449 Subtarget, DAG))
14450 return V;
14451
14452 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14453
14454 if (NumV2Inputs == 0) {
14455 // Try to use shift instructions.
14456 if (SDValue Shift =
14457 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14458 Subtarget, DAG, /*BitwiseOnly*/ false))
14459 return Shift;
14460
14461 // Check for being able to broadcast a single element.
14462 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14463 Mask, Subtarget, DAG))
14464 return Broadcast;
14465
14466 // Try to use bit rotation instructions.
14467 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14468 Subtarget, DAG))
14469 return Rotate;
14470
14471 // Use dedicated unpack instructions for masks that match their pattern.
14472 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14473 return V;
14474
14475 // Use dedicated pack instructions for masks that match their pattern.
14476 if (SDValue V =
14477 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14478 return V;
14479
14480 // Try to use byte rotation instructions.
14481 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14482 Subtarget, DAG))
14483 return Rotate;
14484
14485 // Make a copy of the mask so it can be modified.
14486 SmallVector<int, 8> MutableMask(Mask);
14487 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14488 Subtarget, DAG);
14489 }
14490
14491 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14492 "All single-input shuffles should be canonicalized to be V1-input "
14493 "shuffles.");
14494
14495 // Try to use shift instructions.
14496 if (SDValue Shift =
14497 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14498 DAG, /*BitwiseOnly*/ false))
14499 return Shift;
14500
14501 // See if we can use SSE4A Extraction / Insertion.
14502 if (Subtarget.hasSSE4A())
14503 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14504 Zeroable, DAG))
14505 return V;
14506
14507 // There are special ways we can lower some single-element blends.
14508 if (NumV2Inputs == 1)
14509 if (SDValue V = lowerShuffleAsElementInsertion(
14510 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14511 return V;
14512
14513 // We have different paths for blend lowering, but they all must use the
14514 // *exact* same predicate.
14515 bool IsBlendSupported = Subtarget.hasSSE41();
14516 if (IsBlendSupported)
14517 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14518 Zeroable, Subtarget, DAG))
14519 return Blend;
14520
14521 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14522 Zeroable, Subtarget, DAG))
14523 return Masked;
14524
14525 // Use dedicated unpack instructions for masks that match their pattern.
14526 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14527 return V;
14528
14529 // Use dedicated pack instructions for masks that match their pattern.
14530 if (SDValue V =
14531 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14532 return V;
14533
14534 // Try to lower using a truncation.
14535 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14536 Subtarget, DAG))
14537 return V;
14538
14539 // Try to use byte rotation instructions.
14540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14541 Subtarget, DAG))
14542 return Rotate;
14543
14544 if (SDValue BitBlend =
14545 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14546 return BitBlend;
14547
14548 // Try to use byte shift instructions to mask.
14549 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14550 Zeroable, Subtarget, DAG))
14551 return V;
14552
14553 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14554 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14555 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14556 !Subtarget.hasVLX()) {
14557 // Check if this is part of a 256-bit vector truncation.
14558 unsigned PackOpc = 0;
14559 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14560 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14561 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14562 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14563 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14564 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14565 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14566 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14567 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14568 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14569 PackOpc = X86ISD::PACKUS;
14570 } else if (Subtarget.hasSSE41()) {
14571 SmallVector<SDValue, 4> DWordClearOps(4,
14572 DAG.getConstant(0, DL, MVT::i32));
14573 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14574 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14575 SDValue DWordClearMask =
14576 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14577 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14578 DWordClearMask);
14579 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14580 DWordClearMask);
14581 PackOpc = X86ISD::PACKUS;
14582 } else if (!Subtarget.hasSSSE3()) {
14583 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14584 V1 = DAG.getBitcast(MVT::v4i32, V1);
14585 V2 = DAG.getBitcast(MVT::v4i32, V2);
14586 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14587 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14588 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14589 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14590 PackOpc = X86ISD::PACKSS;
14591 }
14592 if (PackOpc) {
14593 // Now pack things back together.
14594 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14595 if (NumEvenDrops == 2) {
14596 Result = DAG.getBitcast(MVT::v4i32, Result);
14597 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14598 }
14599 return Result;
14600 }
14601 }
14602
14603 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14604 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
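// For example, the odd-compaction mask <1,3,5,...,15> gives NumOddDrops == 1:
// shifting every i32 right by 16 isolates the odd word of each dword, and the
// PACK below then gathers those words from V1 and V2 into one v8i16.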
14605 if (NumOddDrops == 1) {
14606 bool HasSSE41 = Subtarget.hasSSE41();
14607 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14608 DAG.getBitcast(MVT::v4i32, V1),
14609 DAG.getTargetConstant(16, DL, MVT::i8));
14610 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14611 DAG.getBitcast(MVT::v4i32, V2),
14612 DAG.getTargetConstant(16, DL, MVT::i8));
14613 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14614 MVT::v8i16, V1, V2);
14615 }
14616
14617 // Try to lower by permuting the inputs into an unpack instruction.
14618 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14619 Mask, Subtarget, DAG))
14620 return Unpack;
14621
14622 // If we can't directly blend but can use PSHUFB, that will be better as it
14623 // can both shuffle and set up the inefficient blend.
14624 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14625 bool V1InUse, V2InUse;
14626 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14627 Zeroable, DAG, V1InUse, V2InUse);
14628 }
14629
14630 // We can always bit-blend if we have to so the fallback strategy is to
14631 // decompose into single-input permutes and blends/unpacks.
14632 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14633 Zeroable, Subtarget, DAG);
14634}
14635
14636/// Lower 8-lane 16-bit floating point shuffles.
14637static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14638 const APInt &Zeroable, SDValue V1, SDValue V2,
14639 const X86Subtarget &Subtarget,
14640 SelectionDAG &DAG) {
14641 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14642 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14643 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14644 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14645
14646 if (Subtarget.hasFP16()) {
14647 if (NumV2Elements == 0) {
14648 // Check for being able to broadcast a single element.
14649 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14650 Mask, Subtarget, DAG))
14651 return Broadcast;
14652 }
14653 if (NumV2Elements == 1 && Mask[0] >= 8)
14654 if (SDValue V = lowerShuffleAsElementInsertion(
14655 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14656 return V;
14657 }
14658
14659 V1 = DAG.getBitcast(MVT::v8i16, V1);
14660 V2 = DAG.getBitcast(MVT::v8i16, V2);
14661 return DAG.getBitcast(MVT::v8f16,
14662 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14663}
14664
14665// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14666// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14667// the active subvector is extracted.
14668static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14669 ArrayRef<int> OriginalMask, SDValue V1,
14670 SDValue V2, const X86Subtarget &Subtarget,
14671 SelectionDAG &DAG) {
14672 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14673 SmallVector<int, 32> Mask(OriginalMask);
14674 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14675 !isShuffleFoldableLoad(V2)) {
14676 ShuffleVectorSDNode::commuteMask(Mask);
14677 std::swap(V1, V2);
14678 }
14679
14680 MVT MaskVT = VT.changeTypeToInteger();
14681 SDValue MaskNode;
14682 MVT ShuffleVT = VT;
14683 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14684 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14685 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14686 ShuffleVT = V1.getSimpleValueType();
14687
14688 // Adjust mask to correct indices for the second input.
14689 int NumElts = VT.getVectorNumElements();
14690 unsigned Scale = 512 / VT.getSizeInBits();
14691 SmallVector<int, 32> AdjustedMask(Mask);
14692 for (int &M : AdjustedMask)
14693 if (NumElts <= M)
14694 M += (Scale - 1) * NumElts;
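// For example, a v16i8 shuffle widened to v64i8 has Scale == 4, so a V2 index
// of 16 + j becomes 64 + j, i.e. element j of the second widened VPERMV3
// operand.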
14695 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14696 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14697 } else {
14698 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14699 }
14700
14701 SDValue Result;
14702 if (V2.isUndef())
14703 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14704 else
14705 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14706
14707 if (VT != ShuffleVT)
14708 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14709
14710 return Result;
14711}
14712
14713/// Generic lowering of v16i8 shuffles.
14714///
14715/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14716/// detect any complexity reducing interleaving. If that doesn't help, it uses
14717/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14718/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14719/// back together.
14720static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14721 const APInt &Zeroable, SDValue V1, SDValue V2,
14722 const X86Subtarget &Subtarget,
14723 SelectionDAG &DAG) {
14724 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14725 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14726 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14727
14728 // Try to use shift instructions.
14729 if (SDValue Shift =
14730 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14731 DAG, /*BitwiseOnly*/ false))
14732 return Shift;
14733
14734 // Try to use byte rotation instructions.
14735 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14736 Subtarget, DAG))
14737 return Rotate;
14738
14739 // Use dedicated pack instructions for masks that match their pattern.
14740 if (SDValue V =
14741 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14742 return V;
14743
14744 // Try to use a zext lowering.
14745 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14746 Zeroable, Subtarget, DAG))
14747 return ZExt;
14748
14749 // Try to lower using a truncation.
14750 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14751 Subtarget, DAG))
14752 return V;
14753
14754 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14755 Subtarget, DAG))
14756 return V;
14757
14758 // See if we can use SSE4A Extraction / Insertion.
14759 if (Subtarget.hasSSE4A())
14760 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14761 Zeroable, DAG))
14762 return V;
14763
14764 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14765
14766 // For single-input shuffles, there are some nicer lowering tricks we can use.
14767 if (NumV2Elements == 0) {
14768 // Check for being able to broadcast a single element.
14769 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14770 Mask, Subtarget, DAG))
14771 return Broadcast;
14772
14773 // Try to use bit rotation instructions.
14774 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14775 Subtarget, DAG))
14776 return Rotate;
14777
14778 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14779 return V;
14780
14781 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14782 // Notably, this handles splat and partial-splat shuffles more efficiently.
14783 // However, it only makes sense if the pre-duplication shuffle simplifies
14784 // things significantly. Currently, this means we need to be able to
14785 // express the pre-duplication shuffle as an i16 shuffle.
14786 //
14787 // FIXME: We should check for other patterns which can be widened into an
14788 // i16 shuffle as well.
14789 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14790 for (int i = 0; i < 16; i += 2)
14791 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14792 return false;
14793
14794 return true;
14795 };
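// For example, a pair-duplicating mask such as <0,0,5,5,2,2,7,7,...> satisfies
// canWidenViaDuplication: every byte pair reads one source byte, so the
// shuffle can be done as an i16 shuffle plus an unpack that duplicates bytes.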
14796 auto tryToWidenViaDuplication = [&]() -> SDValue {
14797 if (!canWidenViaDuplication(Mask))
14798 return SDValue();
14799 SmallVector<int, 4> LoInputs;
14800 copy_if(Mask, std::back_inserter(LoInputs),
14801 [](int M) { return M >= 0 && M < 8; });
14802 array_pod_sort(LoInputs.begin(), LoInputs.end());
14803 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14804 SmallVector<int, 4> HiInputs;
14805 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14806 array_pod_sort(HiInputs.begin(), HiInputs.end());
14807 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14808
14809 bool TargetLo = LoInputs.size() >= HiInputs.size();
14810 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14811 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14812
14813 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14814 SmallDenseMap<int, int, 8> LaneMap;
14815 for (int I : InPlaceInputs) {
14816 PreDupI16Shuffle[I/2] = I/2;
14817 LaneMap[I] = I;
14818 }
14819 int j = TargetLo ? 0 : 4, je = j + 4;
14820 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14821 // Check if j is already a shuffle of this input. This happens when
14822 // there are two adjacent bytes after we move the low one.
14823 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14824 // If we haven't yet mapped the input, search for a slot into which
14825 // we can map it.
14826 while (j < je && PreDupI16Shuffle[j] >= 0)
14827 ++j;
14828
14829 if (j == je)
14830 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14831 return SDValue();
14832
14833 // Map this input with the i16 shuffle.
14834 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14835 }
14836
14837 // Update the lane map based on the mapping we ended up with.
14838 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14839 }
14840 V1 = DAG.getBitcast(
14841 MVT::v16i8,
14842 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14843 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14844
14845 // Unpack the bytes to form the i16s that will be shuffled into place.
14846 bool EvenInUse = false, OddInUse = false;
14847 for (int i = 0; i < 16; i += 2) {
14848 EvenInUse |= (Mask[i + 0] >= 0);
14849 OddInUse |= (Mask[i + 1] >= 0);
14850 if (EvenInUse && OddInUse)
14851 break;
14852 }
14853 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14854 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14855 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14856
14857 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14858 for (int i = 0; i < 16; ++i)
14859 if (Mask[i] >= 0) {
14860 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14861 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14862 if (PostDupI16Shuffle[i / 2] < 0)
14863 PostDupI16Shuffle[i / 2] = MappedMask;
14864 else
14865 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14866 "Conflicting entries in the original shuffle!");
14867 }
14868 return DAG.getBitcast(
14869 MVT::v16i8,
14870 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14871 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14872 };
14873 if (SDValue V = tryToWidenViaDuplication())
14874 return V;
14875 }
14876
14877 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14878 Zeroable, Subtarget, DAG))
14879 return Masked;
14880
14881 // Use dedicated unpack instructions for masks that match their pattern.
14882 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14883 return V;
14884
14885 // Try to use byte shift instructions to mask.
14886 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14887 Zeroable, Subtarget, DAG))
14888 return V;
14889
14890 // Check for compaction patterns.
14891 bool IsSingleInput = V2.isUndef();
14892 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14893
14894 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14895 // with PSHUFB. It is important to do this before we attempt to generate any
14896 // blends but after all of the single-input lowerings. If the single input
14897 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14898 // want to preserve that and we can DAG combine any longer sequences into
14899 // a PSHUFB in the end. But once we start blending from multiple inputs,
14900 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14901 // and there are *very* few patterns that would actually be faster than the
14902 // PSHUFB approach because of its ability to zero lanes.
14903 //
14904 // If the mask is a binary compaction, we can more efficiently perform this
14905 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14906 //
14907 // FIXME: The only exceptions to the above are blends which are exact
14908 // interleavings with direct instructions supporting them. We currently don't
14909 // handle those well here.
14910 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14911 bool V1InUse = false;
14912 bool V2InUse = false;
14913
14914 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14915 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14916
14917 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14918 // do so. This avoids using them to handle blends-with-zero which is
14919 // important as a single pshufb is significantly faster for that.
14920 if (V1InUse && V2InUse) {
14921 if (Subtarget.hasSSE41())
14922 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14923 Zeroable, Subtarget, DAG))
14924 return Blend;
14925
14926 // We can use an unpack to do the blending rather than an or in some
14927 // cases. Even though the or may be (very minorly) more efficient, we
14928 // preference this lowering because there are common cases where part of
14929 // prefer this lowering because there are common cases where part of
14930 // an unpack.
14931 // FIXME: It might be worth trying to detect if the unpack-feeding
14932 // shuffles will both be pshufb, in which case we shouldn't bother with
14933 // this.
14934 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14935 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14936 return Unpack;
14937
14938 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14939 if (Subtarget.hasVBMI())
14940 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14941 DAG);
14942
14943 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14944 if (Subtarget.hasXOP()) {
14945 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14946 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14947 }
14948
14949 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14950 // PALIGNR will be cheaper than the second PSHUFB+OR.
14951 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14952 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14953 return V;
14954 }
14955
14956 return PSHUFB;
14957 }
14958
14959 // There are special ways we can lower some single-element blends.
14960 if (NumV2Elements == 1)
14961 if (SDValue V = lowerShuffleAsElementInsertion(
14962 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14963 return V;
14964
14965 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14966 return Blend;
14967
14968 // Check whether a compaction lowering can be done. This handles shuffles
14969 // which take every Nth element for some even N. See the helper function for
14970 // details.
14971 //
14972 // We special case these as they can be particularly efficiently handled with
14973 // the PACKUSWB instruction on x86 and they show up in common patterns of
14974 // rearranging bytes to truncate wide elements.
14975 if (NumEvenDrops) {
14976 // NumEvenDrops is the power of two stride of the elements. Another way of
14977 // thinking about it is that we need to drop the even elements this many
14978 // times to get the original input.
14979
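// For example, the compaction mask <0,2,4,...,30> has NumEvenDrops == 1:
// masking every i16 with 0x00FF keeps only the even bytes, and one PACKUS then
// emits the surviving bytes of V1 followed by those of V2.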
14980 // First we need to zero all the dropped bytes.
14981 assert(NumEvenDrops <= 3 &&
14982 "No support for dropping even elements more than 3 times.");
14983 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14984 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14985 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14986 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14987 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14988 WordClearMask);
14989 if (!IsSingleInput)
14990 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14991 WordClearMask);
14992
14993 // Now pack things back together.
14994 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14995 IsSingleInput ? V1 : V2);
14996 for (int i = 1; i < NumEvenDrops; ++i) {
14997 Result = DAG.getBitcast(MVT::v8i16, Result);
14998 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14999 }
15000 return Result;
15001 }
15002
15003 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15004 if (NumOddDrops == 1) {
15005 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15006 DAG.getBitcast(MVT::v8i16, V1),
15007 DAG.getTargetConstant(8, DL, MVT::i8));
15008 if (!IsSingleInput)
15009 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15010 DAG.getBitcast(MVT::v8i16, V2),
15011 DAG.getTargetConstant(8, DL, MVT::i8));
15012 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15013 IsSingleInput ? V1 : V2);
15014 }
15015
15016 // Handle multi-input cases by blending/unpacking single-input shuffles.
15017 if (NumV2Elements > 0)
15018 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15019 Zeroable, Subtarget, DAG);
15020
15021 // The fallback path for single-input shuffles widens this into two v8i16
15022 // vectors with unpacks, shuffles those, and then pulls them back together
15023 // with a pack.
15024 SDValue V = V1;
15025
15026 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15027 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15028 for (int i = 0; i < 16; ++i)
15029 if (Mask[i] >= 0)
15030 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15031
15032 SDValue VLoHalf, VHiHalf;
15033 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15034 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15035 // i16s.
15036 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15037 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15038 // Use a mask to drop the high bytes.
15039 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15040 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15041 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15042
15043 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15044 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15045
15046 // Squash the masks to point directly into VLoHalf.
15047 for (int &M : LoBlendMask)
15048 if (M >= 0)
15049 M /= 2;
15050 for (int &M : HiBlendMask)
15051 if (M >= 0)
15052 M /= 2;
15053 } else {
15054 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15055 // VHiHalf so that we can blend them as i16s.
15056 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15057
15058 VLoHalf = DAG.getBitcast(
15059 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15060 VHiHalf = DAG.getBitcast(
15061 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15062 }
15063
15064 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15065 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15066
15067 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15068}
15069
15070/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15071///
15072/// This routine breaks down the specific type of 128-bit shuffle and
15073/// dispatches to the lowering routines accordingly.
15074static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15075 MVT VT, SDValue V1, SDValue V2,
15076 const APInt &Zeroable,
15077 const X86Subtarget &Subtarget,
15078 SelectionDAG &DAG) {
15079 if (VT == MVT::v8bf16) {
15080 V1 = DAG.getBitcast(MVT::v8i16, V1);
15081 V2 = DAG.getBitcast(MVT::v8i16, V2);
15082 return DAG.getBitcast(VT,
15083 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15084 }
15085
15086 switch (VT.SimpleTy) {
15087 case MVT::v2i64:
15088 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15089 case MVT::v2f64:
15090 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15091 case MVT::v4i32:
15092 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15093 case MVT::v4f32:
15094 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15095 case MVT::v8i16:
15096 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15097 case MVT::v8f16:
15098 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15099 case MVT::v16i8:
15100 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15101
15102 default:
15103 llvm_unreachable("Unimplemented!");
15104 }
15105}
15106
15107/// Generic routine to split vector shuffle into half-sized shuffles.
15108///
15109/// This routine just extracts two subvectors, shuffles them independently, and
15110/// then concatenates them back together. This should work effectively with all
15111/// AVX vector shuffle types.
15112static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15113 SDValue V2, ArrayRef<int> Mask,
15114 SelectionDAG &DAG, bool SimpleOnly) {
15115 assert(VT.getSizeInBits() >= 256 &&
15116 "Only for 256-bit or wider vector shuffles!");
15117 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15118 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15119
15120 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15121 if (VT == MVT::v8f32) {
15122 SDValue BC1 = peekThroughBitcasts(V1);
15123 SDValue BC2 = peekThroughBitcasts(V2);
15124 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15125 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15126 DAG, SimpleOnly))
15127 return DAG.getBitcast(VT, Split);
15128 }
15129 }
15130
15131 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15132 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15133
15134 int NumElements = VT.getVectorNumElements();
15135 int SplitNumElements = NumElements / 2;
15136 MVT ScalarVT = VT.getVectorElementType();
15137 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15138
15139 // Use splitVector/extractSubVector so that split build-vectors just build two
15140 // narrower build vectors. This helps shuffling with splats and zeros.
15141 auto SplitVector = [&](SDValue V) {
15142 SDValue LoV, HiV;
15143 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15144 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15145 DAG.getBitcast(SplitVT, HiV));
15146 };
15147
15148 SDValue LoV1, HiV1, LoV2, HiV2;
15149 std::tie(LoV1, HiV1) = SplitVector(V1);
15150 std::tie(LoV2, HiV2) = SplitVector(V2);
15151
15152 // Now create two 4-way blends of these half-width vectors.
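// For example, splitting a v8i32 shuffle gives SplitNumElements == 4: a half
// mask element in [0,4) reads LoV1, [4,8) HiV1, [8,12) LoV2 and [12,16) HiV2,
// which is how GetHalfBlendPiecesReq below classifies the inputs.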
15153 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15154 bool &UseHiV1, bool &UseLoV2,
15155 bool &UseHiV2) {
15156 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15157 for (int i = 0; i < SplitNumElements; ++i) {
15158 int M = HalfMask[i];
15159 if (M >= NumElements) {
15160 if (M >= NumElements + SplitNumElements)
15161 UseHiV2 = true;
15162 else
15163 UseLoV2 = true;
15164 } else if (M >= 0) {
15165 if (M >= SplitNumElements)
15166 UseHiV1 = true;
15167 else
15168 UseLoV1 = true;
15169 }
15170 }
15171 };
15172
15173 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15174 if (!SimpleOnly)
15175 return true;
15176
15177 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15178 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15179
15180 return !(UseHiV1 || UseHiV2);
15181 };
15182
15183 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15184 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15185 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15186 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15187 for (int i = 0; i < SplitNumElements; ++i) {
15188 int M = HalfMask[i];
15189 if (M >= NumElements) {
15190 V2BlendMask[i] = M - NumElements;
15191 BlendMask[i] = SplitNumElements + i;
15192 } else if (M >= 0) {
15193 V1BlendMask[i] = M;
15194 BlendMask[i] = i;
15195 }
15196 }
15197
15198 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15199 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15200
15201 // Because the lowering happens after all combining takes place, we need to
15202 // manually combine these blend masks as much as possible so that we create
15203 // a minimal number of high-level vector shuffle nodes.
15204 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15205
15206 // First try just blending the halves of V1 or V2.
15207 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15208 return DAG.getUNDEF(SplitVT);
15209 if (!UseLoV2 && !UseHiV2)
15210 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15211 if (!UseLoV1 && !UseHiV1)
15212 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15213
15214 SDValue V1Blend, V2Blend;
15215 if (UseLoV1 && UseHiV1) {
15216 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15217 } else {
15218 // We only use half of V1 so map the usage down into the final blend mask.
15219 V1Blend = UseLoV1 ? LoV1 : HiV1;
15220 for (int i = 0; i < SplitNumElements; ++i)
15221 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15222 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15223 }
15224 if (UseLoV2 && UseHiV2) {
15225 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15226 } else {
15227 // We only use half of V2 so map the usage down into the final blend mask.
15228 V2Blend = UseLoV2 ? LoV2 : HiV2;
15229 for (int i = 0; i < SplitNumElements; ++i)
15230 if (BlendMask[i] >= SplitNumElements)
15231 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15232 }
15233 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15234 };
15235
15236 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15237 return SDValue();
15238
15239 SDValue Lo = HalfBlend(LoMask);
15240 SDValue Hi = HalfBlend(HiMask);
15241 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15242}
15243
15244/// Either split a vector in halves or decompose the shuffles and the
15245/// blend/unpack.
15246///
15247/// This is provided as a good fallback for many lowerings of non-single-input
15248/// shuffles with more than one 128-bit lane. In those cases, we want to select
15249/// between splitting the shuffle into 128-bit components and stitching those
15250/// back together vs. extracting the single-input shuffles and blending those
15251/// results.
15252static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15253 SDValue V2, ArrayRef<int> Mask,
15254 const APInt &Zeroable,
15255 const X86Subtarget &Subtarget,
15256 SelectionDAG &DAG) {
15257 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15258 "shuffles as it could then recurse on itself.");
15259 int Size = Mask.size();
15260
15261 // If this can be modeled as a broadcast of two elements followed by a blend,
15262 // prefer that lowering. This is especially important because broadcasts can
15263 // often fold with memory operands.
15264 auto DoBothBroadcast = [&] {
15265 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15266 for (int M : Mask)
15267 if (M >= Size) {
15268 if (V2BroadcastIdx < 0)
15269 V2BroadcastIdx = M - Size;
15270 else if ((M - Size) != V2BroadcastIdx &&
15271 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15272 return false;
15273 } else if (M >= 0) {
15274 if (V1BroadcastIdx < 0)
15275 V1BroadcastIdx = M;
15276 else if (M != V1BroadcastIdx &&
15277 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15278 return false;
15279 }
15280 return true;
15281 };
15282 if (DoBothBroadcast())
15283 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15284 Subtarget, DAG);
15285
15286 // If the inputs all stem from a single 128-bit lane of each input, then we
15287 // split them rather than blending because the split will decompose to
15288 // unusually few instructions.
15289 int LaneCount = VT.getSizeInBits() / 128;
15290 int LaneSize = Size / LaneCount;
15291 SmallBitVector LaneInputs[2];
15292 LaneInputs[0].resize(LaneCount, false);
15293 LaneInputs[1].resize(LaneCount, false);
15294 for (int i = 0; i < Size; ++i)
15295 if (Mask[i] >= 0)
15296 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15297 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15298 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15299 /*SimpleOnly*/ false);
15300
15301 // Without AVX2, if we can freely split the subvectors then we're better off
15302 // performing half width shuffles.
15303 if (!Subtarget.hasAVX2()) {
15304 SDValue BC1 = peekThroughBitcasts(V1);
15305 SDValue BC2 = peekThroughBitcasts(V2);
15306 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15307 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15308 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15309 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15310 if (SplatOrSplitV1 && SplatOrSplitV2)
15311 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15312 /*SimpleOnly*/ false);
15313 }
15314
15315 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15316 // requires that the decomposed single-input shuffles don't end up here.
15317 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15318 Subtarget, DAG);
15319}
15320
15321// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15322// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15323static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15324 SDValue V1, SDValue V2,
15325 ArrayRef<int> Mask,
15326 SelectionDAG &DAG) {
15327 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15328
15329 int LHSMask[4] = {-1, -1, -1, -1};
15330 int RHSMask[4] = {-1, -1, -1, -1};
15331 int SHUFPDMask[4] = {-1, -1, -1, -1};
15332
15333 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15334 // perform the shuffle once the lanes have been shuffled in place.
15335 for (int i = 0; i != 4; ++i) {
15336 int M = Mask[i];
15337 if (M < 0)
15338 continue;
15339 int LaneBase = i & ~1;
15340 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15341 LaneMask[LaneBase + (M & 1)] = M;
15342 SHUFPDMask[i] = M & 1;
15343 }
15344
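// For example, Mask == {0, 3, 2, 1} produces LHSMask == {0, -1, 2, -1},
// RHSMask == {-1, 3, -1, 1} and SHUFPDMask == {0, 1, 0, 1}: SHUFPD then takes
// each even result element from LHS and each odd one from RHS.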
15345 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15346 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15347 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15348 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15349}
15350
15351/// Lower a vector shuffle crossing multiple 128-bit lanes as
15352/// a lane permutation followed by a per-lane permutation.
15353///
15354/// This is mainly for cases where we can have non-repeating permutes
15355/// in each lane.
15356///
15357/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15358/// we should investigate merging them.
15359static SDValue lowerShuffleAsLanePermuteAndPermute(
15360 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15361 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15362 int NumElts = VT.getVectorNumElements();
15363 int NumLanes = VT.getSizeInBits() / 128;
15364 int NumEltsPerLane = NumElts / NumLanes;
15365 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15366
15367 /// Attempts to find a sublane permute with the given size
15368 /// that gets all elements into their target lanes.
15369 ///
15370 /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffled
15371 /// result; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15372 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15373 int NumSublanesPerLane = NumSublanes / NumLanes;
15374 int NumEltsPerSublane = NumElts / NumSublanes;
15375
15376 SmallVector<int, 16> CrossLaneMask;
15377 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15378 // CrossLaneMask but one entry == one sublane.
15379 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15380 APInt DemandedCrossLane = APInt::getZero(NumElts);
15381
15382 for (int i = 0; i != NumElts; ++i) {
15383 int M = Mask[i];
15384 if (M < 0)
15385 continue;
15386
15387 int SrcSublane = M / NumEltsPerSublane;
15388 int DstLane = i / NumEltsPerLane;
15389
15390 // We only need to get the elements into the right lane, not sublane.
15391 // So search all sublanes that make up the destination lane.
15392 bool Found = false;
15393 int DstSubStart = DstLane * NumSublanesPerLane;
15394 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15395 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15396 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15397 continue;
15398
15399 Found = true;
15400 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15401 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15402 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15403 DemandedCrossLane.setBit(InLaneMask[i]);
15404 break;
15405 }
15406 if (!Found)
15407 return SDValue();
15408 }
15409
15410 // Fill CrossLaneMask using CrossLaneMaskLarge.
15411 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15412
15413 if (!CanUseSublanes) {
15414 // If we're only shuffling a single lowest lane and the rest are identity
15415 // then don't bother.
15416 // TODO - isShuffleMaskInputInPlace could be extended to something like
15417 // this.
15418 int NumIdentityLanes = 0;
15419 bool OnlyShuffleLowestLane = true;
15420 for (int i = 0; i != NumLanes; ++i) {
15421 int LaneOffset = i * NumEltsPerLane;
15422 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15423 i * NumEltsPerLane))
15424 NumIdentityLanes++;
15425 else if (CrossLaneMask[LaneOffset] != 0)
15426 OnlyShuffleLowestLane = false;
15427 }
15428 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15429 return SDValue();
15430 }
15431
15432 // Simplify CrossLaneMask based on the actual demanded elements.
15433 if (V1.hasOneUse())
15434 for (int i = 0; i != NumElts; ++i)
15435 if (!DemandedCrossLane[i])
15436 CrossLaneMask[i] = SM_SentinelUndef;
15437
15438 // Avoid returning the same shuffle operation. For example,
15439 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15440 // undef:v16i16
15441 if (CrossLaneMask == Mask || InLaneMask == Mask)
15442 return SDValue();
15443
15444 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15445 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15446 InLaneMask);
15447 };
15448
15449 // First attempt a solution with full lanes.
15450 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15451 return V;
15452
15453 // The rest of the solutions use sublanes.
15454 if (!CanUseSublanes)
15455 return SDValue();
15456
15457 // Then attempt a solution with 64-bit sublanes (vpermq).
15458 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15459 return V;
15460
15461 // If that doesn't work and we have fast variable cross-lane shuffle,
15462 // attempt 32-bit sublanes (vpermd).
15463 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15464 return SDValue();
15465
15466 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15467}
15468
15469/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
15470static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15471 SmallVector<int> &InLaneMask) {
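// For example, with Size == 8 and LaneSize == 4, a cross-lane element such as
// Mask[1] == 6 becomes InLaneMask[1] == 10: the same in-lane position (2) but
// taken from the second shuffle operand, which callers pass lane-flipped.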
15472 int Size = Mask.size();
15473 InLaneMask.assign(Mask.begin(), Mask.end());
15474 for (int i = 0; i < Size; ++i) {
15475 int &M = InLaneMask[i];
15476 if (M < 0)
15477 continue;
15478 if (((M % Size) / LaneSize) != (i / LaneSize))
15479 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15480 }
15481}
15482
15483/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15484/// source with a lane permutation.
15485///
15486/// This lowering strategy results in four instructions in the worst case for a
15487/// single-input cross lane shuffle which is lower than any other fully general
15488/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15489/// shuffle pattern should be handled prior to trying this lowering.
15490static SDValue lowerShuffleAsLanePermuteAndShuffle(
15491 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15492 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15493 // FIXME: This should probably be generalized for 512-bit vectors as well.
15494 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15495 int Size = Mask.size();
15496 int LaneSize = Size / 2;
15497
15498 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15499 // Only do this if the elements aren't all from the lower lane,
15500 // otherwise we're (probably) better off doing a split.
15501 if (VT == MVT::v4f64 &&
15502 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15503 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15504
15505 // If there are only inputs from one 128-bit lane, splitting will in fact be
15506 // less expensive. The flags track whether the given lane contains an element
15507 // that crosses to another lane.
15508 bool AllLanes;
15509 if (!Subtarget.hasAVX2()) {
15510 bool LaneCrossing[2] = {false, false};
15511 for (int i = 0; i < Size; ++i)
15512 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15513 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15514 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15515 } else {
15516 bool LaneUsed[2] = {false, false};
15517 for (int i = 0; i < Size; ++i)
15518 if (Mask[i] >= 0)
15519 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15520 AllLanes = LaneUsed[0] && LaneUsed[1];
15521 }
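  // Without AVX2, AllLanes requires lane-crossing elements sourced from both
  // 128-bit halves; with AVX2 it only requires that both halves are used,
  // since the lane flip itself is cheap (VPERMQ/VPERMPD).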
15522
15523 // TODO - we could support shuffling V2 in the Flipped input.
15524 assert(V2.isUndef() &&
15525 "This last part of this routine only works on single input shuffles");
15526
15527 SmallVector<int> InLaneMask;
15528 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15529
15530 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15531 "In-lane shuffle mask expected");
15532
15533 // If we're not using both lanes in each lane and the inlane mask is not
15534 // repeating, then we're better off splitting.
15535 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15536 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15537 /*SimpleOnly*/ false);
15538
15539 // Flip the lanes, and shuffle the results which should now be in-lane.
15540 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15541 SDValue Flipped = DAG.getBitcast(PVT, V1);
15542 Flipped =
15543 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15544 Flipped = DAG.getBitcast(VT, Flipped);
15545 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15546}
15547
15548 /// Handle lowering of 2-lane 128-bit shuffles.
15549 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15550                                   SDValue V2, ArrayRef<int> Mask,
15551 const APInt &Zeroable,
15552 const X86Subtarget &Subtarget,
15553 SelectionDAG &DAG) {
15554 if (V2.isUndef()) {
15555 // Attempt to match VBROADCAST*128 subvector broadcast load.
15556 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15557 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15558     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15559         X86::mayFoldLoad(V1, Subtarget)) {
15560       MVT MemVT = VT.getHalfNumVectorElementsVT();
15561       unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15562       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15563       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15564                                              VT, MemVT, Ld, Ofs, DAG))
15565 return BcstLd;
15566 }
15567
15568 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15569 if (Subtarget.hasAVX2())
15570 return SDValue();
15571 }
15572
15573 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15574
15575 SmallVector<int, 4> WidenedMask;
15576 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15577 return SDValue();
15578
15579 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15580 bool IsHighZero = (Zeroable & 0xc) == 0xc;
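  // Zeroable has one bit per 64-bit element: bits 0-1 cover the low 128-bit
  // half and bits 2-3 the high half.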
15581
15582 // Try to use an insert into a zero vector.
15583 if (WidenedMask[0] == 0 && IsHighZero) {
15584 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15585 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15586 DAG.getVectorIdxConstant(0, DL));
15587 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15588 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15589 DAG.getVectorIdxConstant(0, DL));
15590 }
15591
15592   // TODO: If minimizing size and one of the inputs is a zero vector and the
15593   // zero vector has only one use, we could use a VPERM2X128 to save the
15594 // instruction bytes needed to explicitly generate the zero vector.
15595
15596 // Blends are faster and handle all the non-lane-crossing cases.
15597 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15598 Subtarget, DAG))
15599 return Blend;
15600
15601 // If either input operand is a zero vector, use VPERM2X128 because its mask
15602 // allows us to replace the zero input with an implicit zero.
15603 if (!IsLowZero && !IsHighZero) {
15604 // Check for patterns which can be matched with a single insert of a 128-bit
15605 // subvector.
15606 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15607 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15608
15609 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15610 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15611       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15612         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15613 SDValue SubVec =
15614 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15615 DAG.getVectorIdxConstant(0, DL));
15616 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15617 DAG.getVectorIdxConstant(2, DL));
15618 }
15619 }
15620
15621 // Try to use SHUF128 if possible.
15622 if (Subtarget.hasVLX()) {
15623 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15624 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15625 ((WidenedMask[1] % 2) << 1);
15626 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15627 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15628 }
15629 }
15630 }
15631
15632 // Otherwise form a 128-bit permutation. After accounting for undefs,
15633 // convert the 64-bit shuffle mask selection values into 128-bit
15634 // selection bits by dividing the indexes by 2 and shifting into positions
15635 // defined by a vperm2*128 instruction's immediate control byte.
15636
15637 // The immediate permute control byte looks like this:
15638 // [1:0] - select 128 bits from sources for low half of destination
15639 // [2] - ignore
15640 // [3] - zero low half of destination
15641 // [5:4] - select 128 bits from sources for high half of destination
15642 // [6] - ignore
15643 // [7] - zero high half of destination
15644
15645 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15646 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15647
15648 unsigned PermMask = 0;
15649 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15650 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
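  // e.g. a widened mask <1, 2> (upper half of V1, lower half of V2) gives
  // PermMask 0x21.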
15651
15652 // Check the immediate mask and replace unused sources with undef.
15653 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15654 V1 = DAG.getUNDEF(VT);
15655 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15656 V2 = DAG.getUNDEF(VT);
15657
15658 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15659 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15660}
15661
15662/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15663/// shuffling each lane.
15664///
15665/// This attempts to create a repeated lane shuffle where each lane uses one
15666/// or two of the lanes of the inputs. The lanes of the input vectors are
15667/// shuffled in one or two independent shuffles to get the lanes into the
15668/// position needed by the final shuffle.
15669 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15670     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15671 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15672 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15673
15674 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15675 return SDValue();
15676
15677 int NumElts = Mask.size();
15678 int NumLanes = VT.getSizeInBits() / 128;
15679 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15680 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15681 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15682
15683 // First pass will try to fill in the RepeatMask from lanes that need two
15684 // sources.
15685 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15686 int Srcs[2] = {-1, -1};
15687 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15688 for (int i = 0; i != NumLaneElts; ++i) {
15689 int M = Mask[(Lane * NumLaneElts) + i];
15690 if (M < 0)
15691 continue;
15692 // Determine which of the possible input lanes (NumLanes from each source)
15693 // this element comes from. Assign that as one of the sources for this
15694       // lane. We can assign up to 2 sources for this lane. If we run out of
15695       // sources we can't do anything.
15696 int LaneSrc = M / NumLaneElts;
15697 int Src;
15698 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15699 Src = 0;
15700 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15701 Src = 1;
15702 else
15703 return SDValue();
15704
15705 Srcs[Src] = LaneSrc;
15706 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15707 }
15708
15709 // If this lane has two sources, see if it fits with the repeat mask so far.
15710 if (Srcs[1] < 0)
15711 continue;
15712
15713 LaneSrcs[Lane][0] = Srcs[0];
15714 LaneSrcs[Lane][1] = Srcs[1];
15715
15716 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15717 assert(M1.size() == M2.size() && "Unexpected mask size");
15718 for (int i = 0, e = M1.size(); i != e; ++i)
15719 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15720 return false;
15721 return true;
15722 };
15723
15724 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15725 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15726 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15727 int M = Mask[i];
15728 if (M < 0)
15729 continue;
15730 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15731 "Unexpected mask element");
15732 MergedMask[i] = M;
15733 }
15734 };
15735
15736 if (MatchMasks(InLaneMask, RepeatMask)) {
15737 // Merge this lane mask into the final repeat mask.
15738 MergeMasks(InLaneMask, RepeatMask);
15739 continue;
15740 }
15741
15742 // Didn't find a match. Swap the operands and try again.
15743     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15744     ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, InLaneMask.size());
15745
15746 if (MatchMasks(InLaneMask, RepeatMask)) {
15747 // Merge this lane mask into the final repeat mask.
15748 MergeMasks(InLaneMask, RepeatMask);
15749 continue;
15750 }
15751
15752 // Couldn't find a match with the operands in either order.
15753 return SDValue();
15754 }
15755
15756 // Now handle any lanes with only one source.
15757 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15758 // If this lane has already been processed, skip it.
15759 if (LaneSrcs[Lane][0] >= 0)
15760 continue;
15761
15762 for (int i = 0; i != NumLaneElts; ++i) {
15763 int M = Mask[(Lane * NumLaneElts) + i];
15764 if (M < 0)
15765 continue;
15766
15767       // If RepeatMask isn't defined yet we can define it ourselves.
15768 if (RepeatMask[i] < 0)
15769 RepeatMask[i] = M % NumLaneElts;
15770
15771 if (RepeatMask[i] < NumElts) {
15772 if (RepeatMask[i] != M % NumLaneElts)
15773 return SDValue();
15774 LaneSrcs[Lane][0] = M / NumLaneElts;
15775 } else {
15776 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15777 return SDValue();
15778 LaneSrcs[Lane][1] = M / NumLaneElts;
15779 }
15780 }
15781
15782 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15783 return SDValue();
15784 }
15785
15786 SmallVector<int, 16> NewMask(NumElts, -1);
15787 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15788 int Src = LaneSrcs[Lane][0];
15789 for (int i = 0; i != NumLaneElts; ++i) {
15790 int M = -1;
15791 if (Src >= 0)
15792 M = Src * NumLaneElts + i;
15793 NewMask[Lane * NumLaneElts + i] = M;
15794 }
15795 }
15796 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15797 // Ensure we didn't get back the shuffle we started with.
15798 // FIXME: This is a hack to make up for some splat handling code in
15799 // getVectorShuffle.
15800 if (isa<ShuffleVectorSDNode>(NewV1) &&
15801 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15802 return SDValue();
15803
15804 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15805 int Src = LaneSrcs[Lane][1];
15806 for (int i = 0; i != NumLaneElts; ++i) {
15807 int M = -1;
15808 if (Src >= 0)
15809 M = Src * NumLaneElts + i;
15810 NewMask[Lane * NumLaneElts + i] = M;
15811 }
15812 }
15813 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15814 // Ensure we didn't get back the shuffle we started with.
15815 // FIXME: This is a hack to make up for some splat handling code in
15816 // getVectorShuffle.
15817 if (isa<ShuffleVectorSDNode>(NewV2) &&
15818 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15819 return SDValue();
15820
15821 for (int i = 0; i != NumElts; ++i) {
15822 if (Mask[i] < 0) {
15823 NewMask[i] = -1;
15824 continue;
15825 }
15826 NewMask[i] = RepeatMask[i % NumLaneElts];
15827 if (NewMask[i] < 0)
15828 continue;
15829
15830 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15831 }
15832 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15833}
15834
15835/// If the input shuffle mask results in a vector that is undefined in all upper
15836/// or lower half elements and that mask accesses only 2 halves of the
15837/// shuffle's operands, return true. A mask of half the width with mask indexes
15838/// adjusted to access the extracted halves of the original shuffle operands is
15839/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15840/// lower half of each input operand is accessed.
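/// e.g. for a v8f32 mask <u,u,u,u,0,1,12,13> the lower half of the result is
/// undef, so HalfIdx1 = 0 (lower half of V1), HalfIdx2 = 3 (upper half of V2)
/// and HalfMask = <0,1,4,5>.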
15841 static bool
15842 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15843                    int &HalfIdx1, int &HalfIdx2) {
15844 assert((Mask.size() == HalfMask.size() * 2) &&
15845 "Expected input mask to be twice as long as output");
15846
15847 // Exactly one half of the result must be undef to allow narrowing.
15848 bool UndefLower = isUndefLowerHalf(Mask);
15849 bool UndefUpper = isUndefUpperHalf(Mask);
15850 if (UndefLower == UndefUpper)
15851 return false;
15852
15853 unsigned HalfNumElts = HalfMask.size();
15854 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15855 HalfIdx1 = -1;
15856 HalfIdx2 = -1;
15857 for (unsigned i = 0; i != HalfNumElts; ++i) {
15858 int M = Mask[i + MaskIndexOffset];
15859 if (M < 0) {
15860 HalfMask[i] = M;
15861 continue;
15862 }
15863
15864 // Determine which of the 4 half vectors this element is from.
15865 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15866 int HalfIdx = M / HalfNumElts;
15867
15868 // Determine the element index into its half vector source.
15869 int HalfElt = M % HalfNumElts;
15870
15871 // We can shuffle with up to 2 half vectors, set the new 'half'
15872 // shuffle mask accordingly.
15873 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15874 HalfMask[i] = HalfElt;
15875 HalfIdx1 = HalfIdx;
15876 continue;
15877 }
15878 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15879 HalfMask[i] = HalfElt + HalfNumElts;
15880 HalfIdx2 = HalfIdx;
15881 continue;
15882 }
15883
15884 // Too many half vectors referenced.
15885 return false;
15886 }
15887
15888 return true;
15889}
15890
15891/// Given the output values from getHalfShuffleMask(), create a half width
15892/// shuffle of extracted vectors followed by an insert back to full width.
15893 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15894                                      ArrayRef<int> HalfMask, int HalfIdx1,
15895 int HalfIdx2, bool UndefLower,
15896 SelectionDAG &DAG, bool UseConcat = false) {
15897 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15898 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15899
15900 MVT VT = V1.getSimpleValueType();
15901 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15902 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15903
15904 auto getHalfVector = [&](int HalfIdx) {
15905 if (HalfIdx < 0)
15906 return DAG.getUNDEF(HalfVT);
15907 SDValue V = (HalfIdx < 2 ? V1 : V2);
15908 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15909 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15910 DAG.getVectorIdxConstant(HalfIdx, DL));
15911 };
15912
15913 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15914 SDValue Half1 = getHalfVector(HalfIdx1);
15915 SDValue Half2 = getHalfVector(HalfIdx2);
15916 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15917 if (UseConcat) {
15918 SDValue Op0 = V;
15919 SDValue Op1 = DAG.getUNDEF(HalfVT);
15920 if (UndefLower)
15921 std::swap(Op0, Op1);
15922 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15923 }
15924
15925 unsigned Offset = UndefLower ? HalfNumElts : 0;
15926   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15927                      DAG.getVectorIdxConstant(Offset, DL));
15928 }
15929
15930/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15931/// This allows for fast cases such as subvector extraction/insertion
15932/// or shuffling smaller vector types which can lower more efficiently.
15933 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15934                                          SDValue V2, ArrayRef<int> Mask,
15935 const X86Subtarget &Subtarget,
15936 SelectionDAG &DAG) {
15937 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15938 "Expected 256-bit or 512-bit vector");
15939
15940 bool UndefLower = isUndefLowerHalf(Mask);
15941 if (!UndefLower && !isUndefUpperHalf(Mask))
15942 return SDValue();
15943
15944 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15945 "Completely undef shuffle mask should have been simplified already");
15946
15947 // Upper half is undef and lower half is whole upper subvector.
15948 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15949 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15950 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15951 if (!UndefLower &&
15952 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15953 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15954 DAG.getVectorIdxConstant(HalfNumElts, DL));
15955 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15956 DAG.getVectorIdxConstant(0, DL));
15957 }
15958
15959 // Lower half is undef and upper half is whole lower subvector.
15960 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15961 if (UndefLower &&
15962 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15963 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15964 DAG.getVectorIdxConstant(0, DL));
15965 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15966 DAG.getVectorIdxConstant(HalfNumElts, DL));
15967 }
15968
15969 int HalfIdx1, HalfIdx2;
15970 SmallVector<int, 8> HalfMask(HalfNumElts);
15971 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15972 return SDValue();
15973
15974 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15975
15976 // Only shuffle the halves of the inputs when useful.
15977 unsigned NumLowerHalves =
15978 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15979 unsigned NumUpperHalves =
15980 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15981 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15982
15983 // Determine the larger pattern of undef/halves, then decide if it's worth
15984 // splitting the shuffle based on subtarget capabilities and types.
15985 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15986 if (!UndefLower) {
15987 // XXXXuuuu: no insert is needed.
15988 // Always extract lowers when setting lower - these are all free subreg ops.
15989 if (NumUpperHalves == 0)
15990 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15991 UndefLower, DAG);
15992
15993 if (NumUpperHalves == 1) {
15994 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15995 if (Subtarget.hasAVX2()) {
15996 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15997 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15998 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15999 (!isSingleSHUFPSMask(HalfMask) ||
16000 Subtarget.hasFastVariableCrossLaneShuffle()))
16001 return SDValue();
16002         // If this is a unary shuffle (assume that the 2nd operand is
16003 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16004 // are better off extracting the upper half of 1 operand and using a
16005 // narrow shuffle.
16006 if (EltWidth == 64 && V2.isUndef())
16007 return SDValue();
16008         // If this is a unary vXi8 shuffle with in-place halves, then perform a
16009         // full-width pshufb and then merge.
16010 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16011 return SDValue();
16012 }
16013 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16014 if (Subtarget.hasAVX512() && VT.is512BitVector())
16015 return SDValue();
16016 // Extract + narrow shuffle is better than the wide alternative.
16017 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16018 UndefLower, DAG);
16019 }
16020
16021 // Don't extract both uppers, instead shuffle and then extract.
16022 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16023 return SDValue();
16024 }
16025
16026 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16027 if (NumUpperHalves == 0) {
16028 // AVX2 has efficient 64-bit element cross-lane shuffles.
16029 // TODO: Refine to account for unary shuffle, splat, and other masks?
16030 if (Subtarget.hasAVX2() && EltWidth == 64)
16031 return SDValue();
16032 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16033 if (Subtarget.hasAVX512() && VT.is512BitVector())
16034 return SDValue();
16035 // Narrow shuffle + insert is better than the wide alternative.
16036 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16037 UndefLower, DAG);
16038 }
16039
16040 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16041 return SDValue();
16042}
16043
16044/// Handle case where shuffle sources are coming from the same 128-bit lane and
16045/// every lane can be represented as the same repeating mask - allowing us to
16046/// shuffle the sources with the repeating shuffle and then permute the result
16047/// to the destination lanes.
16048 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16049     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16050 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16051 int NumElts = VT.getVectorNumElements();
16052 int NumLanes = VT.getSizeInBits() / 128;
16053 int NumLaneElts = NumElts / NumLanes;
16054
16055 // On AVX2 we may be able to just shuffle the lowest elements and then
16056 // broadcast the result.
16057 if (Subtarget.hasAVX2()) {
16058 for (unsigned BroadcastSize : {16, 32, 64}) {
16059 if (BroadcastSize <= VT.getScalarSizeInBits())
16060 continue;
16061 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16062
16063 // Attempt to match a repeating pattern every NumBroadcastElts,
16064       // accounting for UNDEFs, but only referencing the lowest 128-bit
16065 // lane of the inputs.
16066 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16067 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16068 for (int j = 0; j != NumBroadcastElts; ++j) {
16069 int M = Mask[i + j];
16070 if (M < 0)
16071 continue;
16072 int &R = RepeatMask[j];
16073 if (0 != ((M % NumElts) / NumLaneElts))
16074 return false;
16075 if (0 <= R && R != M)
16076 return false;
16077 R = M;
16078 }
16079 return true;
16080 };
16081
16082 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16083 if (!FindRepeatingBroadcastMask(RepeatMask))
16084 continue;
16085
16086 // Shuffle the (lowest) repeated elements in place for broadcast.
16087 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16088
16089 // Shuffle the actual broadcast.
16090 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16091 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16092 for (int j = 0; j != NumBroadcastElts; ++j)
16093 BroadcastMask[i + j] = j;
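      // e.g. for v8i32 with BroadcastSize 64, NumBroadcastElts is 2 and
      // BroadcastMask becomes <0,1,0,1,0,1,0,1>.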
16094
16095 // Avoid returning the same shuffle operation. For example,
16096 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16097 if (BroadcastMask == Mask)
16098 return SDValue();
16099
16100 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16101 BroadcastMask);
16102 }
16103 }
16104
16105 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16106 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16107 return SDValue();
16108
16109 // Bail if we already have a repeated lane shuffle mask.
16110 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16111 return SDValue();
16112
16113 // Helper to look for repeated mask in each split sublane, and that those
16114 // sublanes can then be permuted into place.
16115 auto ShuffleSubLanes = [&](int SubLaneScale) {
16116 int NumSubLanes = NumLanes * SubLaneScale;
16117 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16118
16119 // Check that all the sources are coming from the same lane and see if we
16120 // can form a repeating shuffle mask (local to each sub-lane). At the same
16121 // time, determine the source sub-lane for each destination sub-lane.
16122 int TopSrcSubLane = -1;
16123 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16124 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16125 SubLaneScale,
16126 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16127
16128 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16129 // Extract the sub-lane mask, check that it all comes from the same lane
16130 // and normalize the mask entries to come from the first lane.
16131 int SrcLane = -1;
16132 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16133 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16134 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16135 if (M < 0)
16136 continue;
16137 int Lane = (M % NumElts) / NumLaneElts;
16138 if ((0 <= SrcLane) && (SrcLane != Lane))
16139 return SDValue();
16140 SrcLane = Lane;
16141 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16142 SubLaneMask[Elt] = LocalM;
16143 }
16144
16145 // Whole sub-lane is UNDEF.
16146 if (SrcLane < 0)
16147 continue;
16148
16149 // Attempt to match against the candidate repeated sub-lane masks.
16150 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16151 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16152 for (int i = 0; i != NumSubLaneElts; ++i) {
16153 if (M1[i] < 0 || M2[i] < 0)
16154 continue;
16155 if (M1[i] != M2[i])
16156 return false;
16157 }
16158 return true;
16159 };
16160
16161 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16162 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16163 continue;
16164
16165 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16166 for (int i = 0; i != NumSubLaneElts; ++i) {
16167 int M = SubLaneMask[i];
16168 if (M < 0)
16169 continue;
16170 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16171 "Unexpected mask element");
16172 RepeatedSubLaneMask[i] = M;
16173 }
16174
16175 // Track the top most source sub-lane - by setting the remaining to
16176 // UNDEF we can greatly simplify shuffle matching.
16177 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16178 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16179 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16180 break;
16181 }
16182
16183 // Bail if we failed to find a matching repeated sub-lane mask.
16184 if (Dst2SrcSubLanes[DstSubLane] < 0)
16185 return SDValue();
16186 }
16187 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16188 "Unexpected source lane");
16189
16190 // Create a repeating shuffle mask for the entire vector.
16191 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16192 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16193 int Lane = SubLane / SubLaneScale;
16194 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16195 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16196 int M = RepeatedSubLaneMask[Elt];
16197 if (M < 0)
16198 continue;
16199 int Idx = (SubLane * NumSubLaneElts) + Elt;
16200 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16201 }
16202 }
16203
16204 // Shuffle each source sub-lane to its destination.
16205 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16206 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16207 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16208 if (SrcSubLane < 0)
16209 continue;
16210 for (int j = 0; j != NumSubLaneElts; ++j)
16211 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16212 }
16213
16214 // Avoid returning the same shuffle operation.
16215 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16216 if (RepeatedMask == Mask || SubLaneMask == Mask)
16217 return SDValue();
16218
16219 SDValue RepeatedShuffle =
16220 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16221
16222 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16223 SubLaneMask);
16224 };
16225
16226 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16227 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16228 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16229 // Otherwise we can only permute whole 128-bit lanes.
16230 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16231 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16232 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16233 MinSubLaneScale = 2;
16234 MaxSubLaneScale =
16235 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16236 }
16237 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16238 MinSubLaneScale = MaxSubLaneScale = 4;
16239
16240 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16241 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16242 return Shuffle;
16243
16244 return SDValue();
16245}
16246
16247 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16248                                    bool &ForceV1Zero, bool &ForceV2Zero,
16249 unsigned &ShuffleImm, ArrayRef<int> Mask,
16250 const APInt &Zeroable) {
16251 int NumElts = VT.getVectorNumElements();
16252 assert(VT.getScalarSizeInBits() == 64 &&
16253 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16254 "Unexpected data type for VSHUFPD");
16255 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16256 "Illegal shuffle mask");
16257
16258 bool ZeroLane[2] = { true, true };
16259 for (int i = 0; i < NumElts; ++i)
16260 ZeroLane[i & 1] &= Zeroable[i];
16261
16262 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16263 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
16264 bool IsSHUFPD = true;
16265 bool IsCommutable = true;
16266 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16267 for (int i = 0; i < NumElts; ++i) {
16268 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16269 continue;
16270 if (Mask[i] < 0)
16271 return false;
16272 int Val = (i & 6) + NumElts * (i & 1);
16273 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16274 if (Mask[i] < Val || Mask[i] > Val + 1)
16275 IsSHUFPD = false;
16276 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16277 IsCommutable = false;
16278 SHUFPDMask[i] = Mask[i] % 2;
16279 }
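  // e.g. a v4f64 mask <1,5,2,7> matches SHUFPD with SHUFPDMask <1,1,0,1>,
  // i.e. immediate 0b1011.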
16280
16281 if (!IsSHUFPD && !IsCommutable)
16282 return false;
16283
16284 if (!IsSHUFPD && IsCommutable)
16285 std::swap(V1, V2);
16286
16287 ForceV1Zero = ZeroLane[0];
16288 ForceV2Zero = ZeroLane[1];
16289 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16290 return true;
16291}
16292
16293 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16294                                       SDValue V2, ArrayRef<int> Mask,
16295 const APInt &Zeroable,
16296 const X86Subtarget &Subtarget,
16297 SelectionDAG &DAG) {
16298 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16299 "Unexpected data type for VSHUFPD");
16300
16301 unsigned Immediate = 0;
16302 bool ForceV1Zero = false, ForceV2Zero = false;
16303 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16304 Mask, Zeroable))
16305 return SDValue();
16306
16307 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16308 if (ForceV1Zero)
16309 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16310 if (ForceV2Zero)
16311 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16312
16313 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16314 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16315}
16316
16317 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16318 // by zeroable elements in the remaining 24 elements. Turn this into two
16319// vmovqb instructions shuffled together.
16320 static SDValue lowerShuffleWithVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16321                                                SDValue V1, SDValue V2,
16322 ArrayRef<int> Mask,
16323 const APInt &Zeroable,
16324 SelectionDAG &DAG) {
16325 assert(VT == MVT::v32i8 && "Unexpected type!");
16326
16327 // The first 8 indices should be every 8th element.
16328 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16329 return SDValue();
16330
16331 // Remaining elements need to be zeroable.
16332 if (Zeroable.countl_one() < (Mask.size() - 8))
16333 return SDValue();
16334
16335 V1 = DAG.getBitcast(MVT::v4i64, V1);
16336 V2 = DAG.getBitcast(MVT::v4i64, V2);
16337
16338 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16339 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16340
16341 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16342 // the upper bits of the result using an unpckldq.
16343 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16344 { 0, 1, 2, 3, 16, 17, 18, 19,
16345 4, 5, 6, 7, 20, 21, 22, 23 });
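  // The low 8 bytes of Unpack now hold the four truncated bytes from V1
  // followed by the four from V2; all remaining bytes are zero.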
16346 // Insert the unpckldq into a zero vector to widen to v32i8.
16347 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16348 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16349 DAG.getVectorIdxConstant(0, DL));
16350}
16351
16352// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16353// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16354// =>
16355// ul = unpckl v1, v2
16356// uh = unpckh v1, v2
16357// a = vperm ul, uh
16358// b = vperm ul, uh
16359//
16360// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16361// and permute. We cannot directly match v3 because it is split into two
16362// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16363// pair of 256-bit shuffles and makes sure the masks are consecutive.
16364//
16365// Once unpck and permute nodes are created, the permute corresponding to this
16366// shuffle is returned, while the other permute replaces the other half of the
16367// shuffle in the selection dag.
16368 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16369                                                  SDValue V1, SDValue V2,
16370 ArrayRef<int> Mask,
16371 SelectionDAG &DAG) {
16372 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16373 VT != MVT::v32i8)
16374 return SDValue();
16375 // <B0, B1, B0+1, B1+1, ..., >
16376 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16377 unsigned Begin1) {
16378 size_t Size = Mask.size();
16379 assert(Size % 2 == 0 && "Expected even mask size");
16380 for (unsigned I = 0; I < Size; I += 2) {
16381 if (Mask[I] != (int)(Begin0 + I / 2) ||
16382 Mask[I + 1] != (int)(Begin1 + I / 2))
16383 return false;
16384 }
16385 return true;
16386 };
16387   // Check which half of the interleaved result this shuffle node is.
16388 int NumElts = VT.getVectorNumElements();
16389 size_t FirstQtr = NumElts / 2;
16390 size_t ThirdQtr = NumElts + NumElts / 2;
16391 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16392 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
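  // e.g. for v8i32 the first-half pattern is <0,8,1,9,2,10,3,11> and the
  // second-half pattern is <4,12,5,13,6,14,7,15>.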
16393 if (!IsFirstHalf && !IsSecondHalf)
16394 return SDValue();
16395
16396 // Find the intersection between shuffle users of V1 and V2.
16397 SmallVector<SDNode *, 2> Shuffles;
16398 for (SDNode *User : V1->users())
16399 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16400 User->getOperand(1) == V2)
16401 Shuffles.push_back(User);
16402 // Limit user size to two for now.
16403 if (Shuffles.size() != 2)
16404 return SDValue();
16405   // Find out which half of the 512-bit shuffle each smaller shuffle is.
16406 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16407 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16408 SDNode *FirstHalf;
16409 SDNode *SecondHalf;
16410 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16411 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16412 FirstHalf = Shuffles[0];
16413 SecondHalf = Shuffles[1];
16414 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16415 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16416 FirstHalf = Shuffles[1];
16417 SecondHalf = Shuffles[0];
16418 } else {
16419 return SDValue();
16420 }
16421 // Lower into unpck and perm. Return the perm of this shuffle and replace
16422 // the other.
16423 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16424 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16425 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16426 DAG.getTargetConstant(0x20, DL, MVT::i8));
16427 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16428 DAG.getTargetConstant(0x31, DL, MVT::i8));
16429 if (IsFirstHalf) {
16430 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16431 return Perm1;
16432 }
16433 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16434 return Perm2;
16435}
16436
16437/// Handle lowering of 4-lane 64-bit floating point shuffles.
16438///
16439/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16440/// isn't available.
16441 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16442                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16443 const X86Subtarget &Subtarget,
16444 SelectionDAG &DAG) {
16445 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16446 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16447 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16448
16449 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16450 Subtarget, DAG))
16451 return V;
16452
16453 if (V2.isUndef()) {
16454 // Check for being able to broadcast a single element.
16455 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16456 Mask, Subtarget, DAG))
16457 return Broadcast;
16458
16459 // Use low duplicate instructions for masks that match their pattern.
16460 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16461 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16462
16463 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16464 // Non-half-crossing single input shuffles can be lowered with an
16465 // interleaved permutation.
16466 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16467 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
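      // e.g. Mask <1,0,3,2> (swap within each 128-bit lane) encodes as 0b0101.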
16468 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16469 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16470 }
16471
16472 // With AVX2 we have direct support for this permutation.
16473 if (Subtarget.hasAVX2())
16474 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16475 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16476
16477 // Try to create an in-lane repeating shuffle mask and then shuffle the
16478 // results into the target lanes.
16479     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16480             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16481 return V;
16482
16483 // Try to permute the lanes and then use a per-lane permute.
16484 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16485 Mask, DAG, Subtarget))
16486 return V;
16487
16488 // Otherwise, fall back.
16489 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16490 DAG, Subtarget);
16491 }
16492
16493 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16494 Zeroable, Subtarget, DAG))
16495 return Blend;
16496
16497 // Use dedicated unpack instructions for masks that match their pattern.
16498 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16499 return V;
16500
16501 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16502 Zeroable, Subtarget, DAG))
16503 return Op;
16504
16505 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16506 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16507 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16508 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16509
16510 // If we have lane crossing shuffles AND they don't all come from the lower
16511 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16512 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16513 // canonicalize to a blend of splat which isn't necessary for this combine.
16514 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16515 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16516 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16517 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16518 (!Subtarget.hasAVX2() ||
16519 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16520 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16521
16522 // If we have one input in place, then we can permute the other input and
16523 // blend the result.
16524 if (V1IsInPlace || V2IsInPlace)
16525 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16526 Zeroable, Subtarget, DAG);
16527
16528 // Try to create an in-lane repeating shuffle mask and then shuffle the
16529 // results into the target lanes.
16530   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16531           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16532 return V;
16533
16534 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16535   // shuffle. However, if we have AVX2 and either input is already in place,
16536   // we will be able to shuffle the other input even across lanes in a single
16537 // instruction so skip this pattern.
16538 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16539     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16540             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16541 return V;
16542
16543 // If we have VLX support, we can use VEXPAND.
16544 if (Subtarget.hasVLX())
16545 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16546 Zeroable, Subtarget, DAG))
16547 return V;
16548
16549   // If we have AVX2 then we always want to lower with a blend because at v4 we
16550 // can fully permute the elements.
16551 if (Subtarget.hasAVX2())
16552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16553 Zeroable, Subtarget, DAG);
16554
16555 // Otherwise fall back on generic lowering.
16556 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16557 Subtarget, DAG);
16558}
16559
16560/// Handle lowering of 4-lane 64-bit integer shuffles.
16561///
16562/// This routine is only called when we have AVX2 and thus a reasonable
16563 /// instruction set for v4i64 shuffling.
16564 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16565                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16566 const X86Subtarget &Subtarget,
16567 SelectionDAG &DAG) {
16568 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16569 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16570 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16571 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16572
16573 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16574 Subtarget, DAG))
16575 return V;
16576
16577 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16578 Zeroable, Subtarget, DAG))
16579 return Blend;
16580
16581 // Check for being able to broadcast a single element.
16582 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16583 Subtarget, DAG))
16584 return Broadcast;
16585
16586 // Try to use shift instructions if fast.
16587 if (Subtarget.preferLowerShuffleAsShift())
16588 if (SDValue Shift =
16589 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16590 Subtarget, DAG, /*BitwiseOnly*/ true))
16591 return Shift;
16592
16593 if (V2.isUndef()) {
16594 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16595 // can use lower latency instructions that will operate on both lanes.
16596 SmallVector<int, 2> RepeatedMask;
16597 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16598 SmallVector<int, 4> PSHUFDMask;
16599 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16600 return DAG.getBitcast(
16601 MVT::v4i64,
16602 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16603 DAG.getBitcast(MVT::v8i32, V1),
16604 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16605 }
16606
16607 // AVX2 provides a direct instruction for permuting a single input across
16608 // lanes.
16609 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16610 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16611 }
16612
16613 // Try to use shift instructions.
16614 if (SDValue Shift =
16615 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16616 DAG, /*BitwiseOnly*/ false))
16617 return Shift;
16618
16619 // If we have VLX support, we can use VALIGN or VEXPAND.
16620 if (Subtarget.hasVLX()) {
16621 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16622 Zeroable, Subtarget, DAG))
16623 return Rotate;
16624
16625 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16626 Zeroable, Subtarget, DAG))
16627 return V;
16628 }
16629
16630 // Try to use PALIGNR.
16631 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16632 Subtarget, DAG))
16633 return Rotate;
16634
16635 // Use dedicated unpack instructions for masks that match their pattern.
16636 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16637 return V;
16638
16639 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16640 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16641
16642 // If we have one input in place, then we can permute the other input and
16643 // blend the result.
16644 if (V1IsInPlace || V2IsInPlace)
16645 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16646 Zeroable, Subtarget, DAG);
16647
16648 // Try to create an in-lane repeating shuffle mask and then shuffle the
16649 // results into the target lanes.
16650   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16651           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16652 return V;
16653
16654 // Try to lower to PERMQ(BLENDD(V1,V2)).
16655 if (SDValue V =
16656 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16657 return V;
16658
16659 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16660   // shuffle. However, if we have AVX2 and either input is already in place,
16661   // we will be able to shuffle the other input even across lanes in a single
16662 // instruction so skip this pattern.
16663 if (!V1IsInPlace && !V2IsInPlace)
16664     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16665             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16666 return Result;
16667
16668 // Otherwise fall back on generic blend lowering.
16669 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16670 Zeroable, Subtarget, DAG);
16671}
16672
16673/// Handle lowering of 8-lane 32-bit floating point shuffles.
16674///
16675/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16676/// isn't available.
16677 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16678                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16679 const X86Subtarget &Subtarget,
16680 SelectionDAG &DAG) {
16681 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16682 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16683 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16684
16685 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16686 Zeroable, Subtarget, DAG))
16687 return Blend;
16688
16689 // Check for being able to broadcast a single element.
16690 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16691 Subtarget, DAG))
16692 return Broadcast;
16693
16694 if (!Subtarget.hasAVX2()) {
16695 SmallVector<int> InLaneMask;
16696 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16697
16698 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16699 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16700 /*SimpleOnly*/ true))
16701 return R;
16702 }
16703 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16704 Zeroable, Subtarget, DAG))
16705 return DAG.getBitcast(MVT::v8f32, ZExt);
16706
16707 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16708 // options to efficiently lower the shuffle.
16709 SmallVector<int, 4> RepeatedMask;
16710 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16711 assert(RepeatedMask.size() == 4 &&
16712 "Repeated masks must be half the mask width!");
16713
16714 // Use even/odd duplicate instructions for masks that match their pattern.
16715 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16716 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16717 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16718 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16719
16720 if (V2.isUndef())
16721 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16722 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16723
16724 // Use dedicated unpack instructions for masks that match their pattern.
16725 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16726 return V;
16727
16728 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16729 // have already handled any direct blends.
16730 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16731 }
16732
16733 // Try to create an in-lane repeating shuffle mask and then shuffle the
16734 // results into the target lanes.
16735   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16736           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16737 return V;
16738
16739 // If we have a single input shuffle with different shuffle patterns in the
16740 // two 128-bit lanes use the variable mask to VPERMILPS.
16741 if (V2.isUndef()) {
16742 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16743 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16744 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16745 }
16746 if (Subtarget.hasAVX2()) {
16747 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16748 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16749 }
16750 // Otherwise, fall back.
16751 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16752 DAG, Subtarget);
16753 }
16754
16755 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16756 // shuffle.
16757   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16758           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16759 return Result;
16760
16761 // If we have VLX support, we can use VEXPAND.
16762 if (Subtarget.hasVLX())
16763 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16764 Zeroable, Subtarget, DAG))
16765 return V;
16766
16767 // Try to match an interleave of two v8f32s and lower them as unpck and
16768 // permutes using ymms. This needs to go before we try to split the vectors.
16769 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16770   if ((Subtarget.hasAVX2() ||
16771        !(isFreeToSplitVector(peekThroughBitcasts(V1), DAG) ||
16772          isFreeToSplitVector(peekThroughBitcasts(V2), DAG))) &&
16773       !Subtarget.hasAVX512())
16774 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16775 Mask, DAG))
16776 return V;
16777
16778   // For non-AVX512, if the mask is of 16-bit elements in lane then try to
16779   // split, since after splitting we get more efficient code using vpunpcklwd
16780   // and vpunpckhwd instructions than with vblend.
16781 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16782 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16783 Subtarget, DAG);
16784
16785 // If we have AVX2 then we always want to lower with a blend because at v8 we
16786 // can fully permute the elements.
16787 if (Subtarget.hasAVX2())
16788 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16789 Zeroable, Subtarget, DAG);
16790
16791 // Otherwise fall back on generic lowering.
16792 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16793 Subtarget, DAG);
16794}
16795
16796/// Handle lowering of 8-lane 32-bit integer shuffles.
16797///
16798/// This routine is only called when we have AVX2 and thus a reasonable
16799 /// instruction set for v8i32 shuffling.
16800 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16801                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16802 const X86Subtarget &Subtarget,
16803 SelectionDAG &DAG) {
16804 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16805 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16806 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16807 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16808
16809 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16810
16811 // Whenever we can lower this as a zext, that instruction is strictly faster
16812 // than any alternative. It also allows us to fold memory operands into the
16813 // shuffle in many cases.
16814 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16815 Zeroable, Subtarget, DAG))
16816 return ZExt;
16817
16818 // Try to match an interleave of two v8i32s and lower them as unpck and
16819 // permutes using ymms. This needs to go before we try to split the vectors.
16820 if (!Subtarget.hasAVX512())
16821 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16822 Mask, DAG))
16823 return V;
16824
16825   // For non-AVX512, if the mask is of 16-bit elements in lane then try to
16826   // split, since after splitting we get more efficient code than vblend by
16827   // using vpunpcklwd and vpunpckhwd instructions.
16828 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16829 !Subtarget.hasAVX512())
16830 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16831 Subtarget, DAG);
16832
16833 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16834 Zeroable, Subtarget, DAG))
16835 return Blend;
16836
16837 // Check for being able to broadcast a single element.
16838 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16839 Subtarget, DAG))
16840 return Broadcast;
16841
16842 // Try to use shift instructions if fast.
16843 if (Subtarget.preferLowerShuffleAsShift()) {
16844 if (SDValue Shift =
16845 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16846 Subtarget, DAG, /*BitwiseOnly*/ true))
16847 return Shift;
16848 if (NumV2Elements == 0)
16849 if (SDValue Rotate =
16850 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16851 return Rotate;
16852 }
16853
16854 // If the shuffle mask is repeated in each 128-bit lane we can use more
16855 // efficient instructions that mirror the shuffles across the two 128-bit
16856 // lanes.
16857 SmallVector<int, 4> RepeatedMask;
16858 bool Is128BitLaneRepeatedShuffle =
16859 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16860 if (Is128BitLaneRepeatedShuffle) {
16861 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16862 if (V2.isUndef())
16863 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16864 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16865
16866 // Use dedicated unpack instructions for masks that match their pattern.
16867 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16868 return V;
16869 }
16870
16871 // Try to use shift instructions.
16872 if (SDValue Shift =
16873 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16874 DAG, /*BitwiseOnly*/ false))
16875 return Shift;
16876
16877 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16878 if (SDValue Rotate =
16879 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16880 return Rotate;
16881
16882 // If we have VLX support, we can use VALIGN or EXPAND.
16883 if (Subtarget.hasVLX()) {
16884 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16885 Zeroable, Subtarget, DAG))
16886 return Rotate;
16887
16888 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16889 Zeroable, Subtarget, DAG))
16890 return V;
16891 }
16892
16893 // Try to use byte rotation instructions.
16894 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16895 Subtarget, DAG))
16896 return Rotate;
16897
16898 // Try to create an in-lane repeating shuffle mask and then shuffle the
16899 // results into the target lanes.
16900   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16901           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16902 return V;
16903
16904 if (V2.isUndef()) {
16905 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16906 // because that should be faster than the variable permute alternatives.
16907 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16908 return V;
16909
16910 // If the shuffle patterns aren't repeated but it's a single input, directly
16911 // generate a cross-lane VPERMD instruction.
16912 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16913 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16914 }
16915
16916 // Assume that a single SHUFPS is faster than an alternative sequence of
16917 // multiple instructions (even if the CPU has a domain penalty).
16918 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16919 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16920 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16921 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16922 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16923 CastV1, CastV2, DAG);
16924 return DAG.getBitcast(MVT::v8i32, ShufPS);
16925 }
16926
16927 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16928 // shuffle.
16929   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16930           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16931 return Result;
16932
16933 // Otherwise fall back on generic blend lowering.
16934 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16935 Zeroable, Subtarget, DAG);
16936}
16937
16938/// Handle lowering of 16-lane 16-bit integer shuffles.
16939///
16940/// This routine is only called when we have AVX2 and thus a reasonable
16941 /// instruction set for v16i16 shuffling.
16942 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16943                                   const APInt &Zeroable, SDValue V1, SDValue V2,
16944 const X86Subtarget &Subtarget,
16945 SelectionDAG &DAG) {
16946 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16947 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16948 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16949 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16950
16951 // Whenever we can lower this as a zext, that instruction is strictly faster
16952 // than any alternative. It also allows us to fold memory operands into the
16953 // shuffle in many cases.
16954   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16955           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16956 return ZExt;
16957
16958 // Check for being able to broadcast a single element.
16959 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16960 Subtarget, DAG))
16961 return Broadcast;
16962
16963 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16964 Zeroable, Subtarget, DAG))
16965 return Blend;
16966
16967 // Use dedicated unpack instructions for masks that match their pattern.
16968 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16969 return V;
16970
16971 // Use dedicated pack instructions for masks that match their pattern.
16972 if (SDValue V =
16973 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16974 return V;
16975
16976   // Try to lower using a truncation.
16977 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16978 Subtarget, DAG))
16979 return V;
16980
16981 // Try to use shift instructions.
16982 if (SDValue Shift =
16983 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16984 Subtarget, DAG, /*BitwiseOnly*/ false))
16985 return Shift;
16986
16987 // Try to use byte rotation instructions.
16988 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16989 Subtarget, DAG))
16990 return Rotate;
16991
16992 // Try to create an in-lane repeating shuffle mask and then shuffle the
16993 // results into the target lanes.
16994   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16995           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16996 return V;
16997
16998 if (V2.isUndef()) {
16999 // Try to use bit rotation instructions.
17000 if (SDValue Rotate =
17001 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17002 return Rotate;
17003
17004 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17005 // because that should be faster than the variable permute alternatives.
17006 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17007 return V;
17008
17009 // There are no generalized cross-lane shuffle operations available on i16
17010 // element types.
17011 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17012       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17013               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17014 return V;
17015
17016 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17017 DAG, Subtarget);
17018 }
17019
17020 SmallVector<int, 8> RepeatedMask;
17021 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17022 // As this is a single-input shuffle, the repeated mask should be
17023 // a strictly valid v8i16 mask that we can pass through to the v8i16
17024 // lowering to handle even the v16 case.
17025       return lowerV8I16GeneralSingleInputShuffle(
17026           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17027 }
17028 }
17029
17030 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17031 Zeroable, Subtarget, DAG))
17032 return PSHUFB;
17033
17034 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17035 if (Subtarget.hasBWI())
17036 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17037
17038 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17039 // shuffle.
17040   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17041           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17042 return Result;
17043
17044 // Try to permute the lanes and then use a per-lane permute.
17045   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17046           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17047 return V;
17048
17049 // Try to match an interleave of two v16i16s and lower them as unpck and
17050 // permutes using ymms.
17051 if (!Subtarget.hasAVX512())
17052 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17053 Mask, DAG))
17054 return V;
17055
17056 // Otherwise fall back on generic lowering.
17057 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17058 Subtarget, DAG);
17059}
17060
17061/// Handle lowering of 32-lane 8-bit integer shuffles.
17062///
17063/// This routine is only called when we have AVX2 and thus a reasonable
17064 /// instruction set for v32i8 shuffling.
17065 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17066                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17067 const X86Subtarget &Subtarget,
17068 SelectionDAG &DAG) {
17069 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17070 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17071 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17072 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17073
17074 // Whenever we can lower this as a zext, that instruction is strictly faster
17075 // than any alternative. It also allows us to fold memory operands into the
17076 // shuffle in many cases.
17077 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17078 Zeroable, Subtarget, DAG))
17079 return ZExt;
17080
17081 // Check for being able to broadcast a single element.
17082 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17083 Subtarget, DAG))
17084 return Broadcast;
17085
17086 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17087 Zeroable, Subtarget, DAG))
17088 return Blend;
17089
17090 // Use dedicated unpack instructions for masks that match their pattern.
17091 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17092 return V;
17093
17094 // Use dedicated pack instructions for masks that match their pattern.
17095 if (SDValue V =
17096 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17097 return V;
17098
17099   // Try to lower using a truncation.
17100 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17101 Subtarget, DAG))
17102 return V;
17103
17104 // Try to use shift instructions.
17105 if (SDValue Shift =
17106 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17107 DAG, /*BitwiseOnly*/ false))
17108 return Shift;
17109
17110 // Try to use byte rotation instructions.
17111 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17112 Subtarget, DAG))
17113 return Rotate;
17114
17115 // Try to use bit rotation instructions.
17116 if (V2.isUndef())
17117 if (SDValue Rotate =
17118 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17119 return Rotate;
17120
17121 // Try to create an in-lane repeating shuffle mask and then shuffle the
17122 // results into the target lanes.
17123   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17124           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17125 return V;
17126
17127 // There are no generalized cross-lane shuffle operations available on i8
17128 // element types.
17129 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17130 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17131 // because that should be faster than the variable permute alternatives.
17132 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17133 return V;
17134
17135     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17136             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17137 return V;
17138
17139 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17140 DAG, Subtarget);
17141 }
17142
17143 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17144 Zeroable, Subtarget, DAG))
17145 return PSHUFB;
17146
17147 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17148 if (Subtarget.hasVBMI())
17149 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17150
17151 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17152 // shuffle.
17153   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17154           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17155 return Result;
17156
17157 // Try to permute the lanes and then use a per-lane permute.
17158   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17159           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17160 return V;
17161
17162   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17163 // by zeroable elements in the remaining 24 elements. Turn this into two
17164 // vmovqb instructions shuffled together.
17165 if (Subtarget.hasVLX())
17166 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17167 Mask, Zeroable, DAG))
17168 return V;
17169
17170 // Try to match an interleave of two v32i8s and lower them as unpck and
17171 // permutes using ymms.
17172 if (!Subtarget.hasAVX512())
17173 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17174 Mask, DAG))
17175 return V;
17176
17177 // Otherwise fall back on generic lowering.
17178 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17179 Subtarget, DAG);
17180}
17181
17182/// High-level routine to lower various 256-bit x86 vector shuffles.
17183///
17184/// This routine either breaks down the specific type of a 256-bit x86 vector
17185/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17186/// together based on the available instructions.
17187 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17188                                   SDValue V1, SDValue V2, const APInt &Zeroable,
17189 const X86Subtarget &Subtarget,
17190 SelectionDAG &DAG) {
17191 // If we have a single input to the zero element, insert that into V1 if we
17192 // can do so cheaply.
17193 int NumElts = VT.getVectorNumElements();
17194 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17195
17196 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17197     if (SDValue Insertion = lowerShuffleAsElementInsertion(
17198             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17199 return Insertion;
17200
17201 // Handle special cases where the lower or upper half is UNDEF.
17202 if (SDValue V =
17203 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17204 return V;
17205
17206 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17207 // can check for those subtargets here and avoid much of the subtarget
17208 // querying in the per-vector-type lowering routines. With AVX1 we have
17209 // essentially *zero* ability to manipulate a 256-bit vector with integer
17210 // types. Since we'll use floating point types there eventually, just
17211 // immediately cast everything to a float and operate entirely in that domain.
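  // For example, on AVX1 a v4i64 shuffle is bitcast to v4f64, lowered in the
  // floating-point domain, and the result is bitcast back afterwards.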
17212 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17213 int ElementBits = VT.getScalarSizeInBits();
17214       // No floating-point type is available; if we can't use the bit operations
17215 // No floating point type available, if we can't use the bit operations
17216 // for masking/blending then decompose into 128-bit vectors.
17217 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17218 Subtarget, DAG))
17219 return V;
17220 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17221 return V;
17222 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17223 }
17224
17225 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17226                                 VT.getVectorNumElements());
17227     V1 = DAG.getBitcast(FpVT, V1);
17228 V2 = DAG.getBitcast(FpVT, V2);
17229 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17230 }
17231
17232 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17233 V1 = DAG.getBitcast(MVT::v16i16, V1);
17234 V2 = DAG.getBitcast(MVT::v16i16, V2);
17235 return DAG.getBitcast(VT,
17236 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17237 }
17238
17239 switch (VT.SimpleTy) {
17240 case MVT::v4f64:
17241 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17242 case MVT::v4i64:
17243 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17244 case MVT::v8f32:
17245 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17246 case MVT::v8i32:
17247 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17248 case MVT::v16i16:
17249 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17250 case MVT::v32i8:
17251 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17252
17253 default:
17254 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17255 }
17256}
17257
17258 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
17259 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17260                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17261 const X86Subtarget &Subtarget,
17262 SelectionDAG &DAG) {
17263 assert(VT.getScalarSizeInBits() == 64 &&
17264 "Unexpected element type size for 128bit shuffle.");
17265
17266   // Handling 256-bit vectors requires VLX, and lowerV2X128VectorShuffle() is
17267   // most probably a better solution for that case.
17268 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17269
17270 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17271 SmallVector<int, 4> Widened128Mask;
17272 if (!canWidenShuffleElements(Mask, Widened128Mask))
17273 return SDValue();
17274 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17275
17276 // Try to use an insert into a zero vector.
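  // Zeroable has one bit per 64-bit element here: 0xf0 means the upper four
  // elements (the upper 256 bits) are zeroable, and 0x0c means elements 2-3
  // are zeroable as well, so only the low 128 bits need to be preserved.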
17277 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17278 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17279 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17280 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17281 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17282 DAG.getVectorIdxConstant(0, DL));
17283 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17284 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17285 DAG.getVectorIdxConstant(0, DL));
17286 }
17287
17288 // Check for patterns which can be matched with a single insert of a 256-bit
17289 // subvector.
17290 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17291 if (OnlyUsesV1 ||
17292 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17293 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17294 SDValue SubVec =
17295 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17296 DAG.getVectorIdxConstant(0, DL));
17297 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17298 DAG.getVectorIdxConstant(4, DL));
17299 }
17300
17301 // See if this is an insertion of the lower 128-bits of V2 into V1.
17302 bool IsInsert = true;
17303 int V2Index = -1;
17304 for (int i = 0; i < 4; ++i) {
17305 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17306 if (Widened128Mask[i] < 0)
17307 continue;
17308
17309 // Make sure all V1 subvectors are in place.
17310 if (Widened128Mask[i] < 4) {
17311 if (Widened128Mask[i] != i) {
17312 IsInsert = false;
17313 break;
17314 }
17315 } else {
17316       // Make sure we only have a single V2 index and it's the lowest 128 bits.
17317 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17318 IsInsert = false;
17319 break;
17320 }
17321 V2Index = i;
17322 }
17323 }
17324 if (IsInsert && V2Index >= 0) {
17325 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17326 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17327 DAG.getVectorIdxConstant(0, DL));
17328 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17329 }
17330
17331   // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17332   // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17333   // widening where possible we at least ensure the lanes stay sequential to
17334   // help later combines.
17335 SmallVector<int, 2> Widened256Mask;
17336 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17337 Widened128Mask.clear();
17338 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17339 }
17340
17341 // Try to lower to vshuf64x2/vshuf32x4.
17342 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17343 int PermMask[4] = {-1, -1, -1, -1};
17344 // Ensure elements came from the same Op.
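  // For example, a widened mask of <0,1,6,7> takes 128-bit chunks 0-1 from V1
  // and chunks 2-3 from V2, giving Ops = {V1, V2} and PermMask = {0,1,2,3}.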
17345 for (int i = 0; i < 4; ++i) {
17346 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17347 if (Widened128Mask[i] < 0)
17348 continue;
17349
17350 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17351 unsigned OpIndex = i / 2;
17352 if (Ops[OpIndex].isUndef())
17353 Ops[OpIndex] = Op;
17354 else if (Ops[OpIndex] != Op)
17355 return SDValue();
17356
17357 PermMask[i] = Widened128Mask[i] % 4;
17358 }
17359
17360 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17361 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17362}
17363
17364/// Handle lowering of 8-lane 64-bit floating point shuffles.
17365 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17366                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17367 const X86Subtarget &Subtarget,
17368 SelectionDAG &DAG) {
17369 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17370 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17371 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17372
17373 if (V2.isUndef()) {
17374 // Use low duplicate instructions for masks that match their pattern.
17375 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17376 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17377
17378 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17379 // Non-half-crossing single input shuffles can be lowered with an
17380 // interleaved permutation.
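      // Bit i of the immediate is set when element i takes the high element of
      // its 128-bit pair; e.g. the mask <1,0,3,2,5,4,7,6> yields immediate 0x55.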
17381 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17382 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17383 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17384 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17385 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17386 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17387 }
17388
17389 SmallVector<int, 4> RepeatedMask;
17390 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17391 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17392 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17393 }
17394
17395 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17396 V2, Subtarget, DAG))
17397 return Shuf128;
17398
17399 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17400 return Unpck;
17401
17402 // Check if the blend happens to exactly fit that of SHUFPD.
17403 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17404 Zeroable, Subtarget, DAG))
17405 return Op;
17406
17407 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17408 Subtarget, DAG))
17409 return V;
17410
17411 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17412 Zeroable, Subtarget, DAG))
17413 return Blend;
17414
17415 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17416}
17417
17418/// Handle lowering of 16-lane 32-bit floating point shuffles.
17419 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17420                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17421 const X86Subtarget &Subtarget,
17422 SelectionDAG &DAG) {
17423 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17424 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17425 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17426
17427 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17428 // options to efficiently lower the shuffle.
17429 SmallVector<int, 4> RepeatedMask;
17430 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17431 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17432
17433 // Use even/odd duplicate instructions for masks that match their pattern.
17434 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17435 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17436 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17437 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17438
17439 if (V2.isUndef())
17440 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17441 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17442
17443 // Use dedicated unpack instructions for masks that match their pattern.
17444 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17445 return V;
17446
17447 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17448 Zeroable, Subtarget, DAG))
17449 return Blend;
17450
17451 // Otherwise, fall back to a SHUFPS sequence.
17452 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17453 }
17454
17455 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17456 Zeroable, Subtarget, DAG))
17457 return Blend;
17458
17459   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17460           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17461 return DAG.getBitcast(MVT::v16f32, ZExt);
17462
17463 // Try to create an in-lane repeating shuffle mask and then shuffle the
17464 // results into the target lanes.
17465   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17466           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17467 return V;
17468
17469 // If we have a single input shuffle with different shuffle patterns in the
17470 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17471 if (V2.isUndef() &&
17472 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17473 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17474 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17475 }
17476
17477 // If we have AVX512F support, we can use VEXPAND.
17478 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17479 Zeroable, Subtarget, DAG))
17480 return V;
17481
17482 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17483}
17484
17485/// Handle lowering of 8-lane 64-bit integer shuffles.
17486 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17487                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17488 const X86Subtarget &Subtarget,
17489 SelectionDAG &DAG) {
17490 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17491 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17492 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17493
17494 // Try to use shift instructions if fast.
17495 if (Subtarget.preferLowerShuffleAsShift())
17496 if (SDValue Shift =
17497 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17498 Subtarget, DAG, /*BitwiseOnly*/ true))
17499 return Shift;
17500
17501 if (V2.isUndef()) {
17502 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17503 // can use lower latency instructions that will operate on all four
17504 // 128-bit lanes.
17505 SmallVector<int, 2> Repeated128Mask;
17506 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17507 SmallVector<int, 4> PSHUFDMask;
17508 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17509 return DAG.getBitcast(
17510 MVT::v8i64,
17511 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17512 DAG.getBitcast(MVT::v16i32, V1),
17513 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17514 }
17515
17516 SmallVector<int, 4> Repeated256Mask;
17517 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17518 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17519 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17520 }
17521
17522 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17523 V2, Subtarget, DAG))
17524 return Shuf128;
17525
17526 // Try to use shift instructions.
17527 if (SDValue Shift =
17528 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17529 DAG, /*BitwiseOnly*/ false))
17530 return Shift;
17531
17532 // Try to use VALIGN.
17533 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17534 Zeroable, Subtarget, DAG))
17535 return Rotate;
17536
17537 // Try to use PALIGNR.
17538 if (Subtarget.hasBWI())
17539 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17540 Subtarget, DAG))
17541 return Rotate;
17542
17543 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17544 return Unpck;
17545
17546 // If we have AVX512F support, we can use VEXPAND.
17547 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17548 Subtarget, DAG))
17549 return V;
17550
17551 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17552 Zeroable, Subtarget, DAG))
17553 return Blend;
17554
17555 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17556}
17557
17558/// Handle lowering of 16-lane 32-bit integer shuffles.
17559 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17560                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17561 const X86Subtarget &Subtarget,
17562 SelectionDAG &DAG) {
17563 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17564 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17565 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17566
17567 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17568
17569 // Whenever we can lower this as a zext, that instruction is strictly faster
17570 // than any alternative. It also allows us to fold memory operands into the
17571 // shuffle in many cases.
17572   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17573           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17574 return ZExt;
17575
17576 // Try to use shift instructions if fast.
17577 if (Subtarget.preferLowerShuffleAsShift()) {
17578 if (SDValue Shift =
17579 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17580 Subtarget, DAG, /*BitwiseOnly*/ true))
17581 return Shift;
17582 if (NumV2Elements == 0)
17583 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17584 Subtarget, DAG))
17585 return Rotate;
17586 }
17587
17588 // If the shuffle mask is repeated in each 128-bit lane we can use more
17589 // efficient instructions that mirror the shuffles across the four 128-bit
17590 // lanes.
17591 SmallVector<int, 4> RepeatedMask;
17592 bool Is128BitLaneRepeatedShuffle =
17593 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17594 if (Is128BitLaneRepeatedShuffle) {
17595 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17596 if (V2.isUndef())
17597 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17598 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17599
17600 // Use dedicated unpack instructions for masks that match their pattern.
17601 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17602 return V;
17603 }
17604
17605 // Try to use shift instructions.
17606 if (SDValue Shift =
17607 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17608 Subtarget, DAG, /*BitwiseOnly*/ false))
17609 return Shift;
17610
17611 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17612 if (SDValue Rotate =
17613 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17614 return Rotate;
17615
17616 // Try to use VALIGN.
17617 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17618 Zeroable, Subtarget, DAG))
17619 return Rotate;
17620
17621 // Try to use byte rotation instructions.
17622 if (Subtarget.hasBWI())
17623 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17624 Subtarget, DAG))
17625 return Rotate;
17626
17627 // Assume that a single SHUFPS is faster than using a permv shuffle.
17628 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17629 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17630 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17631 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17632 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17633 CastV1, CastV2, DAG);
17634 return DAG.getBitcast(MVT::v16i32, ShufPS);
17635 }
17636
17637 // Try to create an in-lane repeating shuffle mask and then shuffle the
17638 // results into the target lanes.
17639   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17640           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17641 return V;
17642
17643 // If we have AVX512F support, we can use VEXPAND.
17644 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17645 Zeroable, Subtarget, DAG))
17646 return V;
17647
17648 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17649 Zeroable, Subtarget, DAG))
17650 return Blend;
17651
17652 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17653}
17654
17655/// Handle lowering of 32-lane 16-bit integer shuffles.
17656 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17657                                   const APInt &Zeroable, SDValue V1, SDValue V2,
17658 const X86Subtarget &Subtarget,
17659 SelectionDAG &DAG) {
17660 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17661 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17662 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17663 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17664
17665 // Whenever we can lower this as a zext, that instruction is strictly faster
17666 // than any alternative. It also allows us to fold memory operands into the
17667 // shuffle in many cases.
17668   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17669           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17670 return ZExt;
17671
17672 // Use dedicated unpack instructions for masks that match their pattern.
17673 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17674 return V;
17675
17676 // Use dedicated pack instructions for masks that match their pattern.
17677 if (SDValue V =
17678 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17679 return V;
17680
17681 // Try to use shift instructions.
17682 if (SDValue Shift =
17683 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17684 Subtarget, DAG, /*BitwiseOnly*/ false))
17685 return Shift;
17686
17687 // Try to use byte rotation instructions.
17688 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17689 Subtarget, DAG))
17690 return Rotate;
17691
17692 if (V2.isUndef()) {
17693 // Try to use bit rotation instructions.
17694 if (SDValue Rotate =
17695 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17696 return Rotate;
17697
17698 SmallVector<int, 8> RepeatedMask;
17699 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17700 // As this is a single-input shuffle, the repeated mask should be
17701 // a strictly valid v8i16 mask that we can pass through to the v8i16
17702 // lowering to handle even the v32 case.
17703 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17704 RepeatedMask, Subtarget, DAG);
17705 }
17706 }
17707
17708 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17709 Zeroable, Subtarget, DAG))
17710 return Blend;
17711
17712 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17713 Zeroable, Subtarget, DAG))
17714 return PSHUFB;
17715
17716 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17717 // shuffle.
17718 if (!V2.isUndef())
17719     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17720             DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17721 return Result;
17722
17723 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17724}
17725
17726/// Handle lowering of 64-lane 8-bit integer shuffles.
17727 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17728                                  const APInt &Zeroable, SDValue V1, SDValue V2,
17729 const X86Subtarget &Subtarget,
17730 SelectionDAG &DAG) {
17731 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17732 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17733 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17734 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17735
17736 // Whenever we can lower this as a zext, that instruction is strictly faster
17737 // than any alternative. It also allows us to fold memory operands into the
17738 // shuffle in many cases.
17739   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17740           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17741 return ZExt;
17742
17743 // Use dedicated unpack instructions for masks that match their pattern.
17744 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17745 return V;
17746
17747 // Use dedicated pack instructions for masks that match their pattern.
17748 if (SDValue V =
17749 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17750 return V;
17751
17752 // Try to use shift instructions.
17753 if (SDValue Shift =
17754 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17755 DAG, /*BitwiseOnly*/ false))
17756 return Shift;
17757
17758 // Try to use byte rotation instructions.
17759 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17760 Subtarget, DAG))
17761 return Rotate;
17762
17763 // Try to use bit rotation instructions.
17764 if (V2.isUndef())
17765 if (SDValue Rotate =
17766 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17767 return Rotate;
17768
17769 // Lower as AND if possible.
17770 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17771 Zeroable, Subtarget, DAG))
17772 return Masked;
17773
17774 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17775 Zeroable, Subtarget, DAG))
17776 return PSHUFB;
17777
17778 // Try to create an in-lane repeating shuffle mask and then shuffle the
17779 // results into the target lanes.
17780   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17781           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17782 return V;
17783
17784   if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17785           DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17786 return Result;
17787
17788 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17789 Zeroable, Subtarget, DAG))
17790 return Blend;
17791
17792 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17793 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17794 // PALIGNR will be cheaper than the second PSHUFB+OR.
17795 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17796 Mask, Subtarget, DAG))
17797 return V;
17798
17799 // If we can't directly blend but can use PSHUFB, that will be better as it
17800 // can both shuffle and set up the inefficient blend.
17801 bool V1InUse, V2InUse;
17802 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17803 DAG, V1InUse, V2InUse);
17804 }
17805
17806 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17807 // shuffle.
17808 if (!V2.isUndef())
17809     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17810             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17811 return Result;
17812
17813 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17814 if (Subtarget.hasVBMI())
17815 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17816
17817 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17818}
17819
17820/// High-level routine to lower various 512-bit x86 vector shuffles.
17821///
17822/// This routine either breaks down the specific type of a 512-bit x86 vector
17823/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17824/// together based on the available instructions.
17825 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17826                                   MVT VT, SDValue V1, SDValue V2,
17827 const APInt &Zeroable,
17828 const X86Subtarget &Subtarget,
17829 SelectionDAG &DAG) {
17830 assert(Subtarget.hasAVX512() &&
17831 "Cannot lower 512-bit vectors w/ basic ISA!");
17832
17833 // If we have a single input to the zero element, insert that into V1 if we
17834 // can do so cheaply.
17835 int NumElts = Mask.size();
17836 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17837
17838 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17839     if (SDValue Insertion = lowerShuffleAsElementInsertion(
17840             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17841 return Insertion;
17842
17843 // Handle special cases where the lower or upper half is UNDEF.
17844 if (SDValue V =
17845 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17846 return V;
17847
17848 // Check for being able to broadcast a single element.
17849 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17850 Subtarget, DAG))
17851 return Broadcast;
17852
17853 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17854 // Try using bit ops for masking and blending before falling back to
17855 // splitting.
17856 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17857 Subtarget, DAG))
17858 return V;
17859 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17860 return V;
17861
17862 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17863 }
17864
17865 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17866 if (!Subtarget.hasBWI())
17867 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17868 /*SimpleOnly*/ false);
17869
17870 V1 = DAG.getBitcast(MVT::v32i16, V1);
17871 V2 = DAG.getBitcast(MVT::v32i16, V2);
17872 return DAG.getBitcast(VT,
17873 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17874 }
17875
17876 // Dispatch to each element type for lowering. If we don't have support for
17877 // specific element type shuffles at 512 bits, immediately split them and
17878 // lower them. Each lowering routine of a given type is allowed to assume that
17879 // the requisite ISA extensions for that element type are available.
17880 switch (VT.SimpleTy) {
17881 case MVT::v8f64:
17882 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17883 case MVT::v16f32:
17884 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17885 case MVT::v8i64:
17886 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17887 case MVT::v16i32:
17888 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17889 case MVT::v32i16:
17890 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17891 case MVT::v64i8:
17892 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17893
17894 default:
17895 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17896 }
17897}
17898
17899 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17900                                          MVT VT, SDValue V1, SDValue V2,
17901 const X86Subtarget &Subtarget,
17902 SelectionDAG &DAG) {
17903 // Shuffle should be unary.
17904 if (!V2.isUndef())
17905 return SDValue();
17906
17907 int ShiftAmt = -1;
17908 int NumElts = Mask.size();
17909 for (int i = 0; i != NumElts; ++i) {
17910 int M = Mask[i];
17911 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17912 "Unexpected mask index.");
17913 if (M < 0)
17914 continue;
17915
17916 // The first non-undef element determines our shift amount.
17917 if (ShiftAmt < 0) {
17918 ShiftAmt = M - i;
17919 // Need to be shifting right.
17920 if (ShiftAmt <= 0)
17921 return SDValue();
17922 }
17923 // All non-undef elements must shift by the same amount.
17924 if (ShiftAmt != M - i)
17925 return SDValue();
17926 }
17927 assert(ShiftAmt >= 0 && "All undef?");
17928
17929   // Great, we found a right shift.
17930 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17931 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17932 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17933 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17934 DAG.getVectorIdxConstant(0, DL));
17935}
17936
17937// Determine if this shuffle can be implemented with a KSHIFT instruction.
17938// Returns the shift amount if possible or -1 if not. This is a simplified
17939// version of matchShuffleAsShift.
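// For example, with Size = 8 and MaskOffset = 0 the mask <2,3,4,5,6,7,Z,Z>
// (Z = zeroable) matches a right shift by 2 (KSHIFTR), while <Z,Z,0,1,2,3,4,5>
// matches a left shift by 2 (KSHIFTL).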
17940static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17941 int MaskOffset, const APInt &Zeroable) {
17942 int Size = Mask.size();
17943
17944 auto CheckZeros = [&](int Shift, bool Left) {
17945 for (int j = 0; j < Shift; ++j)
17946 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17947 return false;
17948
17949 return true;
17950 };
17951
17952 auto MatchShift = [&](int Shift, bool Left) {
17953 unsigned Pos = Left ? Shift : 0;
17954 unsigned Low = Left ? 0 : Shift;
17955 unsigned Len = Size - Shift;
17956 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17957 };
17958
17959 for (int Shift = 1; Shift != Size; ++Shift)
17960 for (bool Left : {true, false})
17961 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17962       Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17963       return Shift;
17964 }
17965
17966 return -1;
17967}
17968
17969
17970// Lower vXi1 vector shuffles.
17971 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17972 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17973 // vector, shuffle, and then truncate it back.
17974 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17975                                 MVT VT, SDValue V1, SDValue V2,
17976 const APInt &Zeroable,
17977 const X86Subtarget &Subtarget,
17978 SelectionDAG &DAG) {
17979 assert(Subtarget.hasAVX512() &&
17980 "Cannot lower 512-bit vectors w/o basic ISA!");
17981
17982 int NumElts = Mask.size();
17983 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17984
17985 // Try to recognize shuffles that are just padding a subvector with zeros.
17986 int SubvecElts = 0;
17987 int Src = -1;
17988 for (int i = 0; i != NumElts; ++i) {
17989 if (Mask[i] >= 0) {
17990 // Grab the source from the first valid mask. All subsequent elements need
17991 // to use this same source.
17992 if (Src < 0)
17993 Src = Mask[i] / NumElts;
17994 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17995 break;
17996 }
17997
17998 ++SubvecElts;
17999 }
18000 assert(SubvecElts != NumElts && "Identity shuffle?");
18001
18002   // Clip to a power of 2.
18003 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18004
18005 // Make sure the number of zeroable bits in the top at least covers the bits
18006 // not covered by the subvector.
18007 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18008 assert(Src >= 0 && "Expected a source!");
18009 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18010 SDValue Extract =
18011 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18012 DAG.getVectorIdxConstant(0, DL));
18013 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18014 DAG.getConstant(0, DL, VT), Extract,
18015 DAG.getVectorIdxConstant(0, DL));
18016 }
18017
18018 // Try a simple shift right with undef elements. Later we'll try with zeros.
18019 if (SDValue Shift =
18020 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18021 return Shift;
18022
18023 // Try to match KSHIFTs.
18024 unsigned Offset = 0;
18025 for (SDValue V : {V1, V2}) {
18026 unsigned Opcode;
18027 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18028 if (ShiftAmt >= 0) {
18029 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18030 MVT WideVT = Res.getSimpleValueType();
18031 // Widened right shifts need two shifts to ensure we shift in zeroes.
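      // For example, when shifting a v8i1 right by 2 inside a widened v16i1 we
      // first shift left by 8 and then right by 10, so zeroes are shifted in.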
18032 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18033 int WideElts = WideVT.getVectorNumElements();
18034 // Shift left to put the original vector in the MSBs of the new size.
18035 Res =
18036 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18037 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18038 // Increase the shift amount to account for the left shift.
18039 ShiftAmt += WideElts - NumElts;
18040 }
18041
18042 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18043 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18044 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18045 DAG.getVectorIdxConstant(0, DL));
18046 }
18047 Offset += NumElts; // Increment for next iteration.
18048 }
18049
18050   // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18051 // ops instead.
18052 // TODO: What other unary shuffles would benefit from this?
18053 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18054 SDValue Op0 = V1.getOperand(0);
18055 SDValue Op1 = V1.getOperand(1);
18056     ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18057     EVT OpVT = Op0.getValueType();
18058 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18059 return DAG.getSetCC(
18060 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18061 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18062 }
18063
18064 MVT ExtVT;
18065 switch (VT.SimpleTy) {
18066 default:
18067 llvm_unreachable("Expected a vector of i1 elements");
18068 case MVT::v2i1:
18069 ExtVT = MVT::v2i64;
18070 break;
18071 case MVT::v4i1:
18072 ExtVT = MVT::v4i32;
18073 break;
18074 case MVT::v8i1:
18075     // Take a 512-bit type; more shuffles are available on KNL. If we have VLX,
18076     // use a 256-bit shuffle.
18077 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18078 break;
18079 case MVT::v16i1:
18080 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18081 // 256-bit operation available.
18082 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18083 break;
18084 case MVT::v32i1:
18085 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18086 // 256-bit operation available.
18087 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18088 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18089 break;
18090 case MVT::v64i1:
18091 // Fall back to scalarization. FIXME: We can do better if the shuffle
18092 // can be partitioned cleanly.
18093 if (!Subtarget.useBWIRegs())
18094 return SDValue();
18095 ExtVT = MVT::v64i8;
18096 break;
18097 }
18098
18099 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18100 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18101
18102 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18103   // Since i1 was sign-extended we can use X86ISD::CVT2MASK.
18104 int NumElems = VT.getVectorNumElements();
18105 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18106 (Subtarget.hasDQI() && (NumElems < 32)))
18107 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18108 Shuffle, ISD::SETGT);
18109
18110 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18111}
18112
18113/// Helper function that returns true if the shuffle mask should be
18114/// commuted to improve canonicalization.
18115 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18116   int NumElements = Mask.size();
18117
18118 int NumV1Elements = 0, NumV2Elements = 0;
18119 for (int M : Mask)
18120 if (M < 0)
18121 continue;
18122 else if (M < NumElements)
18123 ++NumV1Elements;
18124 else
18125 ++NumV2Elements;
18126
18127 // Commute the shuffle as needed such that more elements come from V1 than
18128 // V2. This allows us to match the shuffle pattern strictly on how many
18129 // elements come from V1 without handling the symmetric cases.
18130 if (NumV2Elements > NumV1Elements)
18131 return true;
18132
18133 assert(NumV1Elements > 0 && "No V1 indices");
18134
18135 if (NumV2Elements == 0)
18136 return false;
18137
18138   // When the number of V1 and V2 elements is the same, try to minimize the
18139   // number of uses of V2 in the low half of the vector. When that is tied,
18140   // ensure that the sum of indices for V1 is equal to or lower than the sum of
18141   // indices for V2. When those are equal, try to ensure that the number of odd
18142 // indices for V1 is lower than the number of odd indices for V2.
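  // For example, the v4 mask <4,5,0,1> uses two elements from each input, but
  // both V2 elements land in the low half, so commuting to <0,1,4,5> is
  // preferred.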
18143 if (NumV1Elements == NumV2Elements) {
18144 int LowV1Elements = 0, LowV2Elements = 0;
18145 for (int M : Mask.slice(0, NumElements / 2))
18146 if (M >= NumElements)
18147 ++LowV2Elements;
18148 else if (M >= 0)
18149 ++LowV1Elements;
18150 if (LowV2Elements > LowV1Elements)
18151 return true;
18152 if (LowV2Elements == LowV1Elements) {
18153 int SumV1Indices = 0, SumV2Indices = 0;
18154 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18155 if (Mask[i] >= NumElements)
18156 SumV2Indices += i;
18157 else if (Mask[i] >= 0)
18158 SumV1Indices += i;
18159 if (SumV2Indices < SumV1Indices)
18160 return true;
18161 if (SumV2Indices == SumV1Indices) {
18162 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18163 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18164 if (Mask[i] >= NumElements)
18165 NumV2OddIndices += i % 2;
18166 else if (Mask[i] >= 0)
18167 NumV1OddIndices += i % 2;
18168 if (NumV2OddIndices < NumV1OddIndices)
18169 return true;
18170 }
18171 }
18172 }
18173
18174 return false;
18175}
18176
18177 static bool canCombineAsMaskOperation(SDValue V,
18178                                       const X86Subtarget &Subtarget) {
18179 if (!Subtarget.hasAVX512())
18180 return false;
18181
18182 if (!V.getValueType().isSimple())
18183 return false;
18184
18185 MVT VT = V.getSimpleValueType().getScalarType();
18186 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18187 return false;
18188
18189 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18190 // are preferable to blendw/blendvb/masked-mov.
18191 if ((VT == MVT::i16 || VT == MVT::i8) &&
18192 V.getSimpleValueType().getSizeInBits() < 512)
18193 return false;
18194
18195 auto HasMaskOperation = [&](SDValue V) {
18196     // TODO: Currently we only check a limited set of opcodes. We could probably
18197     // extend this to all binary operations by checking TLI.isBinOp().
18198 switch (V->getOpcode()) {
18199 default:
18200 return false;
18201 case ISD::ADD:
18202 case ISD::SUB:
18203 case ISD::AND:
18204 case ISD::XOR:
18205 case ISD::OR:
18206 case ISD::SMAX:
18207 case ISD::SMIN:
18208 case ISD::UMAX:
18209 case ISD::UMIN:
18210 case ISD::ABS:
18211 case ISD::SHL:
18212 case ISD::SRL:
18213 case ISD::SRA:
18214 case ISD::MUL:
18215 break;
18216 }
18217 if (!V->hasOneUse())
18218 return false;
18219
18220 return true;
18221 };
18222
18223 if (HasMaskOperation(V))
18224 return true;
18225
18226 return false;
18227}
18228
18229// Forward declaration.
18230 static SDValue canonicalizeShuffleMaskWithHorizOp(
18231     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18232     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18233 const X86Subtarget &Subtarget);
18234
18235 /// Top-level lowering for x86 vector shuffles.
18236///
18237/// This handles decomposition, canonicalization, and lowering of all x86
18238/// vector shuffles. Most of the specific lowering strategies are encapsulated
18239/// above in helper routines. The canonicalization attempts to widen shuffles
18240/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18241/// s.t. only one of the two inputs needs to be tested, etc.
18242 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18243                                    SelectionDAG &DAG) {
18244   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18245   ArrayRef<int> OrigMask = SVOp->getMask();
18246 SDValue V1 = Op.getOperand(0);
18247 SDValue V2 = Op.getOperand(1);
18248 MVT VT = Op.getSimpleValueType();
18249 int NumElements = VT.getVectorNumElements();
18250 SDLoc DL(Op);
18251 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18252
18253 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18254 "Can't lower MMX shuffles");
18255
18256 bool V1IsUndef = V1.isUndef();
18257 bool V2IsUndef = V2.isUndef();
18258 if (V1IsUndef && V2IsUndef)
18259 return DAG.getUNDEF(VT);
18260
18261   // When we create a shuffle node we put the UNDEF node in the second operand,
18262 // but in some cases the first operand may be transformed to UNDEF.
18263 // In this case we should just commute the node.
18264 if (V1IsUndef)
18265 return DAG.getCommutedVectorShuffle(*SVOp);
18266
18267 // Check for non-undef masks pointing at an undef vector and make the masks
18268 // undef as well. This makes it easier to match the shuffle based solely on
18269 // the mask.
18270 if (V2IsUndef &&
18271 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18272 SmallVector<int, 8> NewMask(OrigMask);
18273 for (int &M : NewMask)
18274 if (M >= NumElements)
18275 M = -1;
18276 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18277 }
18278
18279 // Check for illegal shuffle mask element index values.
18280 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18281 (void)MaskUpperLimit;
18282 assert(llvm::all_of(OrigMask,
18283 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18284 "Out of bounds shuffle index");
18285
18286 // We actually see shuffles that are entirely re-arrangements of a set of
18287 // zero inputs. This mostly happens while decomposing complex shuffles into
18288 // simple ones. Directly lower these as a buildvector of zeros.
18289 APInt KnownUndef, KnownZero;
18290 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18291
18292 APInt Zeroable = KnownUndef | KnownZero;
18293 if (Zeroable.isAllOnes())
18294 return getZeroVector(VT, Subtarget, DAG, DL);
18295
18296 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18297
18298 // Try to collapse shuffles into using a vector type with fewer elements but
18299 // wider element types. We cap this to not form integers or floating point
18300 // elements wider than 64 bits. It does not seem beneficial to form i128
18301 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
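  // For example, the v8i32 mask <0,1,4,5,2,3,6,7> widens to the v4i64 mask
  // <0,2,1,3>, since each pair of adjacent elements moves together.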
18302 SmallVector<int, 16> WidenedMask;
18303 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18304 !canCombineAsMaskOperation(V1, Subtarget) &&
18305 !canCombineAsMaskOperation(V2, Subtarget) &&
18306 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18307 // Shuffle mask widening should not interfere with a broadcast opportunity
18308 // by obfuscating the operands with bitcasts.
18309 // TODO: Avoid lowering directly from this top-level function: make this
18310 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18311 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18312 Subtarget, DAG))
18313 return Broadcast;
18314
18315 MVT NewEltVT = VT.isFloatingPoint()
18316                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18317                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18318     int NewNumElts = NumElements / 2;
18319 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18320 // Make sure that the new vector type is legal. For example, v2f64 isn't
18321 // legal on SSE1.
18322 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18323 if (V2IsZero) {
18324 // Modify the new Mask to take all zeros from the all-zero vector.
18325 // Choose indices that are blend-friendly.
18326 bool UsedZeroVector = false;
18327 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18328 "V2's non-undef elements are used?!");
18329 for (int i = 0; i != NewNumElts; ++i)
18330 if (WidenedMask[i] == SM_SentinelZero) {
18331 WidenedMask[i] = i + NewNumElts;
18332 UsedZeroVector = true;
18333 }
18334 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18335 // some elements to be undef.
18336 if (UsedZeroVector)
18337 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18338 }
18339 V1 = DAG.getBitcast(NewVT, V1);
18340 V2 = DAG.getBitcast(NewVT, V2);
18341 return DAG.getBitcast(
18342 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18343 }
18344 }
18345
18346 SmallVector<SDValue> Ops = {V1, V2};
18347 SmallVector<int> Mask(OrigMask);
18348
18349 // Canonicalize the shuffle with any horizontal ops inputs.
18350 // NOTE: This may update Ops and Mask.
18351   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18352           Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18353 return DAG.getBitcast(VT, HOp);
18354
18355 V1 = DAG.getBitcast(VT, Ops[0]);
18356 V2 = DAG.getBitcast(VT, Ops[1]);
18357 assert(NumElements == (int)Mask.size() &&
18358 "canonicalizeShuffleMaskWithHorizOp "
18359 "shouldn't alter the shuffle mask size");
18360
18361 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18362 // These will be materialized uniformly anyway, so make splat matching easier.
18363 // TODO: Allow all int constants?
18364 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18365 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18366 BitVector Undefs;
18367 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18368 if (Undefs.any() &&
18369             (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18370              isa<ConstantFPSDNode>(Splat))) {
18371           V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18372 }
18373 }
18374 }
18375 return V;
18376 };
18377 V1 = CanonicalizeConstant(V1);
18378 V2 = CanonicalizeConstant(V2);
18379
18380 // Commute the shuffle if it will improve canonicalization.
18381   if (canonicalizeShuffleMaskWithCommute(Mask)) {
18382     ShuffleVectorSDNode::commuteMask(Mask);
18383     std::swap(V1, V2);
18384 }
18385
18386 // For each vector width, delegate to a specialized lowering routine.
18387 if (VT.is128BitVector())
18388 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18389
18390 if (VT.is256BitVector())
18391 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18392
18393 if (VT.is512BitVector())
18394 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18395
18396 if (Is1BitVector)
18397 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18398
18399 llvm_unreachable("Unimplemented!");
18400}
18401
18402// As legal vpcompress instructions depend on various AVX512 extensions, try to
18403// convert illegal vector sizes to legal ones to avoid expansion.
18404 static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18405                                     SelectionDAG &DAG) {
18406 assert(Subtarget.hasAVX512() &&
18407 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18408
18409 SDLoc DL(Op);
18410 SDValue Vec = Op.getOperand(0);
18411 SDValue Mask = Op.getOperand(1);
18412 SDValue Passthru = Op.getOperand(2);
18413
18414 EVT VecVT = Vec.getValueType();
18415 EVT ElementVT = VecVT.getVectorElementType();
18416 unsigned NumElements = VecVT.getVectorNumElements();
18417 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18418 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18419
18420 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18421 // compressed as 512-bit vectors in AVX512F.
18422 if (NumVecBits != 128 && NumVecBits != 256)
18423 return SDValue();
18424
18425 if (NumElementBits == 32 || NumElementBits == 64) {
18426 unsigned NumLargeElements = 512 / NumElementBits;
18427 MVT LargeVecVT =
18428 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18429 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18430
18431 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18432 DAG, DL);
18433 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18434 Subtarget, DAG, DL);
18435 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18436 : widenSubVector(LargeVecVT, Passthru,
18437 /*ZeroNewElements=*/false,
18438 Subtarget, DAG, DL);
18439
18440 SDValue Compressed =
18441 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18443 DAG.getConstant(0, DL, MVT::i64));
18444 }
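  // Illustrative sketch of the path above (types assumed): a v4i32
  // VECTOR_COMPRESS is widened to v16i32, the mask is widened to v16i1 with
  // the new lanes zeroed so they contribute nothing, the 512-bit compress is
  // legal under AVX512F, and the original v4i32 result is recovered with an
  // EXTRACT_SUBVECTOR at index 0.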
18445
18446 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18447 VecVT == MVT::v16i16) {
18448     MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18449     EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18450
18451 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18452 Passthru = Passthru.isUndef()
18453 ? DAG.getUNDEF(LargeVecVT)
18454 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18455
18456 SDValue Compressed =
18457 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18458 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18459 }
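  // Illustrative sketch (types assumed): v16i8 has no legal compress, so the
  // source (and a non-undef passthru) are any-extended to v16i32, compressed
  // as a 512-bit vector, and the result truncated back to v16i8.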
18460
18461 return SDValue();
18462}
18463
18464/// Try to lower a VSELECT instruction to a vector shuffle.
18465 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18466                                            const X86Subtarget &Subtarget,
18467 SelectionDAG &DAG) {
18468 SDValue Cond = Op.getOperand(0);
18469 SDValue LHS = Op.getOperand(1);
18470 SDValue RHS = Op.getOperand(2);
18471 MVT VT = Op.getSimpleValueType();
18472
18473 // Only non-legal VSELECTs reach this lowering, convert those into generic
18474 // shuffles and re-use the shuffle lowering path for blends.
18475   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18476     SmallVector<int, 32> Mask;
18477     if (createShuffleMaskFromVSELECT(Mask, Cond))
18478       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18479 }
18480
18481 return SDValue();
18482}
18483
18484SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18485 SDValue Cond = Op.getOperand(0);
18486 SDValue LHS = Op.getOperand(1);
18487 SDValue RHS = Op.getOperand(2);
18488
18489 SDLoc dl(Op);
18490 MVT VT = Op.getSimpleValueType();
18491 if (isSoftF16(VT, Subtarget)) {
18492 MVT NVT = VT.changeVectorElementTypeToInteger();
18493 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18494 DAG.getBitcast(NVT, LHS),
18495 DAG.getBitcast(NVT, RHS)));
18496 }
18497
18498 // A vselect where all conditions and data are constants can be optimized into
18499 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18500   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18501       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18502       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18503     return SDValue();
18504
18505 // Try to lower this to a blend-style vector shuffle. This can handle all
18506 // constant condition cases.
18507 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18508 return BlendOp;
18509
18510   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18511 // with patterns on the mask registers on AVX-512.
18512 MVT CondVT = Cond.getSimpleValueType();
18513 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18514 if (CondEltSize == 1)
18515 return Op;
18516
18517 // Variable blends are only legal from SSE4.1 onward.
18518 if (!Subtarget.hasSSE41())
18519 return SDValue();
18520
18521 unsigned EltSize = VT.getScalarSizeInBits();
18522 unsigned NumElts = VT.getVectorNumElements();
18523
18524 // Expand v32i16/v64i8 without BWI.
18525 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18526 return SDValue();
18527
18528 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18529 // into an i1 condition so that we can use the mask-based 512-bit blend
18530 // instructions.
18531 if (VT.getSizeInBits() == 512) {
18532 // Build a mask by testing the condition against zero.
18533 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18534 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18535 DAG.getConstant(0, dl, CondVT),
18536 ISD::SETNE);
18537 // Now return a new VSELECT using the mask.
18538 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18539 }
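  // Illustrative sketch (v16i32 condition assumed): the wide condition is
  // compared against zero with SETNE to produce a v16i1 mask, and the select
  // is re-issued so the AVX-512 masked-blend patterns can match it.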
18540
18541 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18542 if (CondEltSize != EltSize) {
18543 // If we don't have a sign splat, rely on the expansion.
18544 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18545 return SDValue();
18546
18547 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18548 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18549 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18550 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18551 }
18552
18553 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18554 // are free to split, then better to split before expanding the
18555 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18556 // TODO: This is very similar to narrowVectorSelect.
18557 // TODO: Add Load splitting to isFreeToSplitVector ?
18558 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18559 !Subtarget.hasXOP()) {
18560 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18561 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18562 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18563 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18564 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18565 if (FreeCond && (FreeLHS || FreeRHS))
18566 return splitVectorOp(Op, DAG, dl);
18567 }
18568
18569 // Only some types will be legal on some subtargets. If we can emit a legal
18570   // VSELECT-matching blend, return Op; but if we need to expand, return
18571 // a null value.
18572 switch (VT.SimpleTy) {
18573 default:
18574 // Most of the vector types have blends past SSE4.1.
18575 return Op;
18576
18577 case MVT::v32i8:
18578 // The byte blends for AVX vectors were introduced only in AVX2.
18579 if (Subtarget.hasAVX2())
18580 return Op;
18581
18582 return SDValue();
18583
18584 case MVT::v8i16:
18585 case MVT::v16i16:
18586 case MVT::v8f16:
18587 case MVT::v16f16: {
18588 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
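    // Note (informal): this relies on the usual x86 convention that a non-i1
    // vector condition is all-zeros or all-ones per element, so after the
    // bitcast both bytes of every 16-bit lane carry the same sign bit and the
    // byte blend (e.g. PBLENDVB) selects the whole lane consistently.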
18589 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18590 Cond = DAG.getBitcast(CastVT, Cond);
18591 LHS = DAG.getBitcast(CastVT, LHS);
18592 RHS = DAG.getBitcast(CastVT, RHS);
18593 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18594 return DAG.getBitcast(VT, Select);
18595 }
18596 }
18597}
18598
18599 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18600   MVT VT = Op.getSimpleValueType();
18601 SDValue Vec = Op.getOperand(0);
18602 SDValue Idx = Op.getOperand(1);
18603 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18604 SDLoc dl(Op);
18605
18606   if (!Vec.getSimpleValueType().is128BitVector())
18607     return SDValue();
18608
18609 if (VT.getSizeInBits() == 8) {
18610 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18611 // we're going to zero extend the register or fold the store.
18612     if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18613         !X86::mayFoldIntoStore(Op))
18614       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18615 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18616 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18617
18618 unsigned IdxVal = Idx->getAsZExtVal();
18619 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18620 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18621 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18622 }
18623
18624 if (VT == MVT::f32) {
18625 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18626 // the result back to FR32 register. It's only worth matching if the
18627 // result has a single use which is a store or a bitcast to i32. And in
18628 // the case of a store, it's not worth it if the index is a constant 0,
18629 // because a MOVSSmr can be used instead, which is smaller and faster.
18630 if (!Op.hasOneUse())
18631 return SDValue();
18632 SDNode *User = *Op.getNode()->user_begin();
18633 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18634 (User->getOpcode() != ISD::BITCAST ||
18635 User->getValueType(0) != MVT::i32))
18636 return SDValue();
18637 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18638 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18639 return DAG.getBitcast(MVT::f32, Extract);
18640 }
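  // Illustrative example (single store user assumed): extracting lane 2 of a
  // v4f32 whose only use is a store can become a single
  //   extractps $2, %xmm0, (%mem)
  // rather than a shuffle plus a separate movss/movd round-trip.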
18641
18642 if (VT == MVT::i32 || VT == MVT::i64)
18643 return Op;
18644
18645 return SDValue();
18646}
18647
18648/// Extract one bit from mask vector, like v16i1 or v8i1.
18649/// AVX-512 feature.
18650 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18651                                         const X86Subtarget &Subtarget) {
18652 SDValue Vec = Op.getOperand(0);
18653 SDLoc dl(Vec);
18654 MVT VecVT = Vec.getSimpleValueType();
18655 SDValue Idx = Op.getOperand(1);
18656 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18657 MVT EltVT = Op.getSimpleValueType();
18658
18659 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18660 "Unexpected vector type in ExtractBitFromMaskVector");
18661
18662 // variable index can't be handled in mask registers,
18663 // extend vector to VR512/128
18664 if (!IdxC) {
18665 unsigned NumElts = VecVT.getVectorNumElements();
18666     // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18667     // than extending to 128/256 bit.
18668 if (NumElts == 1) {
18669 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18670       MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18671       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18672 }
18673 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18674 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18675 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18676 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18677 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18678 }
18679
18680 unsigned IdxVal = IdxC->getZExtValue();
18681 if (IdxVal == 0) // the operation is legal
18682 return Op;
18683
18684 // Extend to natively supported kshift.
18685 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18686
18687 // Use kshiftr instruction to move to the lower element.
18688 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18689 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18690
18691 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18692 DAG.getVectorIdxConstant(0, dl));
18693}
18694
18695// Helper to find all the extracted elements from a vector.
18696 static APInt getExtractedDemandedElts(SDNode *N) {
18697   MVT VT = N->getSimpleValueType(0);
18698 unsigned NumElts = VT.getVectorNumElements();
18699 APInt DemandedElts = APInt::getZero(NumElts);
18700 for (SDNode *User : N->users()) {
18701 switch (User->getOpcode()) {
18702 case X86ISD::PEXTRB:
18703 case X86ISD::PEXTRW:
18704     case ISD::EXTRACT_VECTOR_ELT:
18705       if (!isa<ConstantSDNode>(User->getOperand(1))) {
18706         DemandedElts.setAllBits();
18707 return DemandedElts;
18708 }
18709 DemandedElts.setBit(User->getConstantOperandVal(1));
18710 break;
18711 case ISD::BITCAST: {
18712 if (!User->getValueType(0).isSimple() ||
18713 !User->getValueType(0).isVector()) {
18714 DemandedElts.setAllBits();
18715 return DemandedElts;
18716 }
18717 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18718 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18719 break;
18720 }
18721 default:
18722 DemandedElts.setAllBits();
18723 return DemandedElts;
18724 }
18725 }
18726 return DemandedElts;
18727}
18728
18729SDValue
18730X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18731 SelectionDAG &DAG) const {
18732 SDLoc dl(Op);
18733 SDValue Vec = Op.getOperand(0);
18734 MVT VecVT = Vec.getSimpleValueType();
18735 SDValue Idx = Op.getOperand(1);
18736 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18737
18738 if (VecVT.getVectorElementType() == MVT::i1)
18739 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18740
18741 if (!IdxC) {
18742     // It's more profitable to go through memory (1 cycle throughput)
18743 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18744 // IACA tool was used to get performance estimation
18745 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18746 //
18747 // example : extractelement <16 x i8> %a, i32 %i
18748 //
18749 // Block Throughput: 3.00 Cycles
18750 // Throughput Bottleneck: Port5
18751 //
18752 // | Num Of | Ports pressure in cycles | |
18753 // | Uops | 0 - DV | 5 | 6 | 7 | |
18754 // ---------------------------------------------
18755 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18756 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18757 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18758 // Total Num Of Uops: 4
18759 //
18760 //
18761 // Block Throughput: 1.00 Cycles
18762 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18763 //
18764 // | | Ports pressure in cycles | |
18765 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18766 // ---------------------------------------------------------
18767 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18768 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18769 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18770 // Total Num Of Uops: 4
18771
18772 return SDValue();
18773 }
18774
18775 unsigned IdxVal = IdxC->getZExtValue();
18776
18777 // If this is a 256-bit vector result, first extract the 128-bit vector and
18778 // then extract the element from the 128-bit vector.
18779 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18780 // Get the 128-bit vector.
18781 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18782 MVT EltVT = VecVT.getVectorElementType();
18783
18784 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18785 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18786
18787 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18788 // this can be done with a mask.
18789 IdxVal &= ElemsPerChunk - 1;
18790 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18791 DAG.getVectorIdxConstant(IdxVal, dl));
18792 }
18793
18794 assert(VecVT.is128BitVector() && "Unexpected vector length");
18795
18796 MVT VT = Op.getSimpleValueType();
18797
18798 if (VT == MVT::i16) {
18799 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18800 // we're going to zero extend the register or fold the store (SSE41 only).
18801 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18802 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18803 if (Subtarget.hasFP16())
18804 return Op;
18805
18806 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18807 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18808 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18809 }
18810
18811 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18812 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18813 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18814 }
18815
18816 if (Subtarget.hasSSE41())
18817 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18818 return Res;
18819
18820 // Only extract a single element from a v16i8 source - determine the common
18821 // DWORD/WORD that all extractions share, and extract the sub-byte.
18822 // TODO: Add QWORD MOVQ extraction?
18823 if (VT == MVT::i8) {
18824 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18825 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18826
18827 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18828 int DWordIdx = IdxVal / 4;
18829 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18830 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18831 DAG.getBitcast(MVT::v4i32, Vec),
18832 DAG.getVectorIdxConstant(DWordIdx, dl));
18833 int ShiftVal = (IdxVal % 4) * 8;
18834 if (ShiftVal != 0)
18835 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18836 DAG.getConstant(ShiftVal, dl, MVT::i8));
18837 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18838 }
18839
18840 int WordIdx = IdxVal / 2;
18841 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18842 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18843 DAG.getBitcast(MVT::v8i16, Vec),
18844 DAG.getVectorIdxConstant(WordIdx, dl));
18845 int ShiftVal = (IdxVal % 2) * 8;
18846 if (ShiftVal != 0)
18847 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18848 DAG.getConstant(ShiftVal, dl, MVT::i8));
18849 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18850 }
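    // Worked example (index assumed): extracting byte 5 when all demanded
    // bytes live in word 2 (bytes 4 and 5) becomes
    //   (trunc (srl (pextrw $2, %xmm0), 8))
    // i.e. pull out 16-bit word 5/2 = 2 and shift by (5 % 2) * 8 = 8 bits.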
18851 }
18852
18853 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18854 if (IdxVal == 0)
18855 return Op;
18856
18857 // Shuffle the element to the lowest element, then movss or movsh.
18858 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18859 Mask[0] = static_cast<int>(IdxVal);
18860 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18861 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18862 DAG.getVectorIdxConstant(0, dl));
18863 }
18864
18865 if (VT.getSizeInBits() == 64) {
18866 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18867 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18868 // to match extract_elt for f64.
18869 if (IdxVal == 0)
18870 return Op;
18871
18872 // UNPCKHPD the element to the lowest double word, then movsd.
18873 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18874 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18875 int Mask[2] = { 1, -1 };
18876 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18877 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18878 DAG.getVectorIdxConstant(0, dl));
18879 }
18880
18881 return SDValue();
18882}
18883
18884/// Insert one bit to mask vector, like v16i1 or v8i1.
18885/// AVX-512 feature.
18886 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18887                                      const X86Subtarget &Subtarget) {
18888 SDLoc dl(Op);
18889 SDValue Vec = Op.getOperand(0);
18890 SDValue Elt = Op.getOperand(1);
18891 SDValue Idx = Op.getOperand(2);
18892 MVT VecVT = Vec.getSimpleValueType();
18893
18894 if (!isa<ConstantSDNode>(Idx)) {
18895     // Non-constant index. Extend source and destination,
18896 // insert element and then truncate the result.
18897 unsigned NumElts = VecVT.getVectorNumElements();
18898 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18899 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18900 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18901 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18902 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18903 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18904 }
18905
18906 // Copy into a k-register, extract to v1i1 and insert_subvector.
18907 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18908 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18909}
18910
18911SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18912 SelectionDAG &DAG) const {
18913 MVT VT = Op.getSimpleValueType();
18914 MVT EltVT = VT.getVectorElementType();
18915 unsigned NumElts = VT.getVectorNumElements();
18916 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18917
18918 if (EltVT == MVT::i1)
18919 return InsertBitToMaskVector(Op, DAG, Subtarget);
18920
18921 SDLoc dl(Op);
18922 SDValue N0 = Op.getOperand(0);
18923 SDValue N1 = Op.getOperand(1);
18924 SDValue N2 = Op.getOperand(2);
18925 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18926
18927 if (EltVT == MVT::bf16) {
18928 MVT IVT = VT.changeVectorElementTypeToInteger();
18929 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18930 DAG.getBitcast(IVT, N0),
18931 DAG.getBitcast(MVT::i16, N1), N2);
18932 return DAG.getBitcast(VT, Res);
18933 }
18934
18935 if (!N2C) {
18936 // Variable insertion indices, usually we're better off spilling to stack,
18937 // but AVX512 can use a variable compare+select by comparing against all
18938 // possible vector indices, and FP insertion has less gpr->simd traffic.
18939 if (!(Subtarget.hasBWI() ||
18940 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18941 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18942 return SDValue();
18943
18944 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18945 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18946 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18947 return SDValue();
18948
18949 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18950 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18951 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18952
18953 SmallVector<SDValue, 16> RawIndices;
18954 for (unsigned I = 0; I != NumElts; ++I)
18955 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18956 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18957
18958 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18959     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18960                            ISD::CondCode::SETEQ);
18961 }
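  // Illustrative sketch (v8i32 and AVX-512 assumed): for a variable index %i
  // this builds splat(%i), splat(%x) and the constant vector <0,1,...,7>, and
  // emits
  //   vselect (splat(%i) == <0..7>), splat(%x), %v
  // so only the lane whose position equals %i receives the new element.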
18962
18963 if (N2C->getAPIntValue().uge(NumElts))
18964 return SDValue();
18965 uint64_t IdxVal = N2C->getZExtValue();
18966
18967 bool IsZeroElt = X86::isZeroNode(N1);
18968 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18969
18970 if (IsZeroElt || IsAllOnesElt) {
18971     // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18972 // We don't deal with i8 0 since it appears to be handled elsewhere.
18973 if (IsAllOnesElt &&
18974 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18975 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18976 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18977 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18978 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18979 CstVectorElts[IdxVal] = OnesCst;
18980 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18981 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18982 }
18983 // See if we can do this more efficiently with a blend shuffle with a
18984 // rematerializable vector.
18985 if (Subtarget.hasSSE41() &&
18986 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18987 SmallVector<int, 8> BlendMask;
18988 for (unsigned i = 0; i != NumElts; ++i)
18989 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18990 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18991 : getOnesVector(VT, DAG, dl);
18992 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18993 }
18994 }
18995
18996 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18997 // into that, and then insert the subvector back into the result.
18998 if (VT.is256BitVector() || VT.is512BitVector()) {
18999 // With a 256-bit vector, we can insert into the zero element efficiently
19000 // using a blend if we have AVX or AVX2 and the right data type.
19001 if (VT.is256BitVector() && IdxVal == 0) {
19002 // TODO: It is worthwhile to cast integer to floating point and back
19003 // and incur a domain crossing penalty if that's what we'll end up
19004 // doing anyway after extracting to a 128-bit vector.
19005 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19006 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19007 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19008 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19009 DAG.getTargetConstant(1, dl, MVT::i8));
19010 }
19011 }
19012
19013 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19014 assert(isPowerOf2_32(NumEltsIn128) &&
19015 "Vectors will always have power-of-two number of elements.");
19016
19017 // If we are not inserting into the low 128-bit vector chunk,
19018 // then prefer the broadcast+blend sequence.
19019 // FIXME: relax the profitability check iff all N1 uses are insertions.
19020 if (IdxVal >= NumEltsIn128 &&
19021 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19022 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19023 X86::mayFoldLoad(N1, Subtarget)))) {
19024 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19025 SmallVector<int, 8> BlendMask;
19026 for (unsigned i = 0; i != NumElts; ++i)
19027 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19028 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19029 }
19030
19031 // Get the desired 128-bit vector chunk.
19032 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19033
19034 // Insert the element into the desired chunk.
19035 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19036 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19037
19038 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19039 DAG.getVectorIdxConstant(IdxIn128, dl));
19040
19041 // Insert the changed part back into the bigger vector
19042 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19043 }
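  // Worked example (v16i16 with AVX2 assumed): inserting into lane 9 prefers
  // a vpbroadcastw of the scalar plus a blend that rewrites only lane 9;
  // without that path the high 128-bit chunk is extracted, the element is
  // inserted at lane 9 & 7 = 1 of the chunk, and the chunk is inserted back.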
19044 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19045
19046 // This will be just movw/movd/movq/movsh/movss/movsd.
19047 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19048 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19049 EltVT == MVT::f16 || EltVT == MVT::i64) {
19050 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19051 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19052 }
19053
19054 // We can't directly insert an i8 or i16 into a vector, so zero extend
19055 // it to i32 first.
19056 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19057 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19058 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19059 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19060 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19061 return DAG.getBitcast(VT, N1);
19062 }
19063 }
19064
19065   // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
19066 // argument. SSE41 required for pinsrb.
19067 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19068 unsigned Opc;
19069 if (VT == MVT::v8i16) {
19070 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19071       Opc = X86ISD::PINSRW;
19072     } else {
19073 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19074 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19075       Opc = X86ISD::PINSRB;
19076     }
19077
19078 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19079 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19080 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19081 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19082 }
19083
19084 if (Subtarget.hasSSE41()) {
19085 if (EltVT == MVT::f32) {
19086 // Bits [7:6] of the constant are the source select. This will always be
19087 // zero here. The DAG Combiner may combine an extract_elt index into
19088 // these bits. For example (insert (extract, 3), 2) could be matched by
19089 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19090 // Bits [5:4] of the constant are the destination select. This is the
19091 // value of the incoming immediate.
19092 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19093 // combine either bitwise AND or insert of float 0.0 to set these bits.
19094
19095 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19096 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19097 // If this is an insertion of 32-bits into the low 32-bits of
19098 // a vector, we prefer to generate a blend with immediate rather
19099 // than an insertps. Blends are simpler operations in hardware and so
19100 // will always have equal or better performance than insertps.
19101 // But if optimizing for size and there's a load folding opportunity,
19102 // generate insertps because blendps does not have a 32-bit memory
19103 // operand form.
19104 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19105 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19106 DAG.getTargetConstant(1, dl, MVT::i8));
19107 }
19108       // Create this as a scalar to vector.
19109 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19110 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19111 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19112 }
19113
19114 // PINSR* works with constant index.
19115 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19116 return Op;
19117 }
19118
19119 return SDValue();
19120}
19121
19122 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19123                                      SelectionDAG &DAG) {
19124 SDLoc dl(Op);
19125 MVT OpVT = Op.getSimpleValueType();
19126
19127   // It's always cheaper to replace a xor+movd with xorps, and doing so
19128   // simplifies further combines.
19129 if (X86::isZeroNode(Op.getOperand(0)))
19130 return getZeroVector(OpVT, Subtarget, DAG, dl);
19131
19132 // If this is a 256-bit vector result, first insert into a 128-bit
19133 // vector and then insert into the 256-bit vector.
19134 if (!OpVT.is128BitVector()) {
19135 // Insert into a 128-bit vector.
19136 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19137     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19138                                  OpVT.getVectorNumElements() / SizeFactor);
19139
19140 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19141
19142 // Insert the 128-bit vector.
19143 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19144 }
19145 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19146 "Expected an SSE type!");
19147
19148   // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19149 // tblgen.
19150 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19151 return Op;
19152
19153 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19154 return DAG.getBitcast(
19155 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19156}
19157
19158// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19159// simple superregister reference or explicit instructions to insert
19160// the upper bits of a vector.
19161 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19162                                      SelectionDAG &DAG) {
19163 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19164
19165 return insert1BitVector(Op, DAG, Subtarget);
19166}
19167
19168 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19169                                       SelectionDAG &DAG) {
19170 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19171 "Only vXi1 extract_subvectors need custom lowering");
19172
19173 SDLoc dl(Op);
19174 SDValue Vec = Op.getOperand(0);
19175 uint64_t IdxVal = Op.getConstantOperandVal(1);
19176
19177 if (IdxVal == 0) // the operation is legal
19178 return Op;
19179
19180 // Extend to natively supported kshift.
19181 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19182
19183 // Shift to the LSB.
19184 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19185 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19186
19187 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19188 DAG.getVectorIdxConstant(0, dl));
19189}
19190
19191// Returns the appropriate wrapper opcode for a global reference.
19192unsigned X86TargetLowering::getGlobalWrapperKind(
19193 const GlobalValue *GV, const unsigned char OpFlags) const {
19194 // References to absolute symbols are never PC-relative.
19195 if (GV && GV->isAbsoluteSymbolRef())
19196 return X86ISD::Wrapper;
19197
19198 // The following OpFlags under RIP-rel PIC use RIP.
19199 if (Subtarget.isPICStyleRIPRel() &&
19200 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19201 OpFlags == X86II::MO_DLLIMPORT))
19202 return X86ISD::WrapperRIP;
19203
19204 // GOTPCREL references must always use RIP.
19205 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19206 return X86ISD::WrapperRIP;
19207
19208 return X86ISD::Wrapper;
19209}
19210
19211// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19212// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19213// one of the above mentioned nodes. It has to be wrapped because otherwise
19214// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19215 // be used to form an addressing mode. These wrapped nodes will be selected
19216// into MOV32ri.
19217SDValue
19218X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19219 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19220
19221 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19222 // global base reg.
19223 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19224
19225 auto PtrVT = getPointerTy(DAG.getDataLayout());
19226   SDValue Result = DAG.getTargetConstantPool(
19227       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19228 SDLoc DL(CP);
19229 Result =
19230 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19231 // With PIC, the address is actually $g + Offset.
19232 if (OpFlag) {
19233 Result =
19234 DAG.getNode(ISD::ADD, DL, PtrVT,
19235 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19236 }
19237
19238 return Result;
19239}
19240
19241SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19242 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19243
19244 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19245 // global base reg.
19246 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19247
19248 EVT PtrVT = Op.getValueType();
19249 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19250 SDLoc DL(JT);
19251 Result =
19252 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19253
19254 // With PIC, the address is actually $g + Offset.
19255 if (OpFlag)
19256 Result =
19257 DAG.getNode(ISD::ADD, DL, PtrVT,
19258 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19259
19260 return Result;
19261}
19262
19263SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19264 SelectionDAG &DAG) const {
19265 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19266}
19267
19268SDValue
19269X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19270 // Create the TargetBlockAddressAddress node.
19271 unsigned char OpFlags =
19272 Subtarget.classifyBlockAddressReference();
19273 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19274 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19275 SDLoc dl(Op);
19276 EVT PtrVT = Op.getValueType();
19277 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19278 Result =
19279 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19280
19281 // With PIC, the address is actually $g + Offset.
19282 if (isGlobalRelativeToPICBase(OpFlags)) {
19283 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19284 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19285 }
19286
19287 return Result;
19288}
19289
19290/// Creates target global address or external symbol nodes for calls or
19291/// other uses.
19292SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19293 bool ForCall,
19294 bool *IsImpCall) const {
19295 // Unpack the global address or external symbol.
19296 SDLoc dl(Op);
19297 const GlobalValue *GV = nullptr;
19298 int64_t Offset = 0;
19299 const char *ExternalSym = nullptr;
19300 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19301 GV = G->getGlobal();
19302 Offset = G->getOffset();
19303 } else {
19304 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19305 ExternalSym = ES->getSymbol();
19306 }
19307
19308 // Calculate some flags for address lowering.
19309   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19310   unsigned char OpFlags;
19311 if (ForCall)
19312 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19313 else
19314 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19315 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19316 bool NeedsLoad = isGlobalStubReference(OpFlags);
19317
19318   CodeModel::Model M = DAG.getTarget().getCodeModel();
19319   EVT PtrVT = Op.getValueType();
19320   SDValue Result;
19321
19322 if (GV) {
19323 // Create a target global address if this is a global. If possible, fold the
19324 // offset into the global address reference. Otherwise, ADD it on later.
19325 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19326 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19327 // relocation will compute to a negative value, which is invalid.
19328 int64_t GlobalOffset = 0;
19329     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19330         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19331 std::swap(GlobalOffset, Offset);
19332 }
19333 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19334 } else {
19335 // If this is not a global address, this must be an external symbol.
19336 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19337 }
19338
19339 // If this is a direct call, avoid the wrapper if we don't need to do any
19340 // loads or adds. This allows SDAG ISel to match direct calls.
19341 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19342 return Result;
19343
19344 // If Import Call Optimization is enabled and this is an imported function
19345 // then make a note of it and return the global address without wrapping.
19346 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19347 Mod.getModuleFlag("import-call-optimization")) {
19348 assert(ForCall && "Should only enable import call optimization if we are "
19349 "lowering a call");
19350 *IsImpCall = true;
19351 return Result;
19352 }
19353
19354 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19355
19356 // With PIC, the address is actually $g + Offset.
19357 if (HasPICReg) {
19358 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19359 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19360 }
19361
19362 // For globals that require a load from a stub to get the address, emit the
19363 // load.
19364 if (NeedsLoad)
19365     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19366                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19367
19368 // If there was a non-zero offset that we didn't fold, create an explicit
19369 // addition for it.
19370 if (Offset != 0)
19371 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19372 DAG.getSignedConstant(Offset, dl, PtrVT));
19373
19374 return Result;
19375}
19376
19377SDValue
19378X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19379 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19380}
19381
19382 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19383                           const EVT PtrVT, unsigned ReturnReg,
19384 unsigned char OperandFlags,
19385 bool LoadGlobalBaseReg = false,
19386 bool LocalDynamic = false) {
19387   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19388   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19389 SDLoc dl(GA);
19390 SDValue TGA;
19391 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19392 SDValue Chain = DAG.getEntryNode();
19393 SDValue Ret;
19394 if (LocalDynamic && UseTLSDESC) {
19395 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19396 // Reuse existing GetTLSADDR node if we can find it.
19397 if (TGA->hasOneUse()) {
19398 // TLSDESC uses TGA.
19399 SDNode *TLSDescOp = *TGA->user_begin();
19400 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19401 "Unexpected TLSDESC DAG");
19402 // CALLSEQ_END uses TGA via a chain and glue.
19403 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19404 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19405 "Unexpected TLSDESC DAG");
19406 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19407 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19408 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19409 "Unexpected TLSDESC DAG");
19410 Ret = SDValue(CopyFromRegOp, 0);
19411 }
19412 } else {
19413 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19414 GA->getOffset(), OperandFlags);
19415 }
19416
19417 if (!Ret) {
19418 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19419                                  : LocalDynamic ? X86ISD::TLSBASEADDR
19420                                                 : X86ISD::TLSADDR;
19421
19422 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19423 if (LoadGlobalBaseReg) {
19424 SDValue InGlue;
19425 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19426 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19427 InGlue);
19428 InGlue = Chain.getValue(1);
19429 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19430 } else {
19431 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19432 }
19433 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19434
19435 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19436 MFI.setHasCalls(true);
19437
19438 SDValue Glue = Chain.getValue(1);
19439 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19440 }
19441
19442 if (!UseTLSDESC)
19443 return Ret;
19444
19445 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19446   unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19447   Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19447
19449 SDValue Offset =
19450       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19451                   MachinePointerInfo(Ptr));
19452 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19453}
19454
19455// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19456static SDValue
19457 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19458                                 const EVT PtrVT) {
19459 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19460 /*LoadGlobalBaseReg=*/true);
19461}
19462
19463// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19464static SDValue
19465 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19466                                 const EVT PtrVT) {
19467 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19468}
19469
19470// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19471static SDValue
19472 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19473                                  const EVT PtrVT) {
19474 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19475}
19476
19477 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19478                                            SelectionDAG &DAG, const EVT PtrVT,
19479 bool Is64Bit, bool Is64BitLP64) {
19480 SDLoc dl(GA);
19481
19482   // Get the start address of the TLS block for this module.
19483   X86MachineFunctionInfo *MFI =
19484       DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
19485   MFI->incNumLocalDynamicTLSAccesses();
19486
19487 SDValue Base;
19488 if (Is64Bit) {
19489 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19490 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19491 /*LoadGlobalBaseReg=*/false,
19492 /*LocalDynamic=*/true);
19493 } else {
19494 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19495 /*LoadGlobalBaseReg=*/true,
19496 /*LocalDynamic=*/true);
19497 }
19498
19499 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19500 // of Base.
19501
19502 // Build x@dtpoff.
19503 unsigned char OperandFlags = X86II::MO_DTPOFF;
19504 unsigned WrapperKind = X86ISD::Wrapper;
19505 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19506 GA->getValueType(0),
19507 GA->getOffset(), OperandFlags);
19508 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19509
19510 // Add x@dtpoff with the base.
19511 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19512}
19513
19514// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19515 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19516                                    const EVT PtrVT, TLSModel::Model model,
19517 bool is64Bit, bool isPIC) {
19518 SDLoc dl(GA);
19519
19520   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19521   Value *Ptr = Constant::getNullValue(
19522       PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19523
19524 SDValue ThreadPointer =
19525       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19526                   MachinePointerInfo(Ptr));
19527
19528 unsigned char OperandFlags = 0;
19529 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19530 // initialexec.
19531 unsigned WrapperKind = X86ISD::Wrapper;
19532 if (model == TLSModel::LocalExec) {
19533 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19534 } else if (model == TLSModel::InitialExec) {
19535 if (is64Bit) {
19536 OperandFlags = X86II::MO_GOTTPOFF;
19537 WrapperKind = X86ISD::WrapperRIP;
19538 } else {
19539 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19540 }
19541 } else {
19542 llvm_unreachable("Unexpected model");
19543 }
19544
19545 // emit "addl x@ntpoff,%eax" (local exec)
19546 // or "addl x@indntpoff,%eax" (initial exec)
19547 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19548 SDValue TGA =
19549 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19550 GA->getOffset(), OperandFlags);
19551 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19552
19553 if (model == TLSModel::InitialExec) {
19554 if (isPIC && !is64Bit) {
19555 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19556 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19557 Offset);
19558 }
19559
19560     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19561                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19562 }
19563
19564 // The address of the thread local variable is the add of the thread
19565 // pointer with the offset of the variable.
19566 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19567}
19568
19569SDValue
19570X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19571
19572 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19573
19574 if (DAG.getTarget().useEmulatedTLS())
19575 return LowerToTLSEmulatedModel(GA, DAG);
19576
19577 const GlobalValue *GV = GA->getGlobal();
19578 EVT PtrVT = Op.getValueType();
19579 bool PositionIndependent = isPositionIndependent();
19580
19581 if (Subtarget.isTargetELF()) {
19582 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19583 switch (model) {
19584     case TLSModel::GeneralDynamic:
19585       if (Subtarget.is64Bit()) {
19586 if (Subtarget.isTarget64BitLP64())
19587 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19588 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19589 }
19590 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19591     case TLSModel::LocalDynamic:
19592       return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19593 Subtarget.isTarget64BitLP64());
19594     case TLSModel::InitialExec:
19595     case TLSModel::LocalExec:
19596       return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19597 PositionIndependent);
19598 }
19599 llvm_unreachable("Unknown TLS model.");
19600 }
19601
19602 if (Subtarget.isTargetDarwin()) {
19603 // Darwin only has one model of TLS. Lower to that.
19604 unsigned char OpFlag = 0;
19605 unsigned WrapperKind = 0;
19606
19607 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19608 // global base reg.
19609 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19610 if (PIC32) {
19611 OpFlag = X86II::MO_TLVP_PIC_BASE;
19612 WrapperKind = X86ISD::Wrapper;
19613 } else {
19614 OpFlag = X86II::MO_TLVP;
19615 WrapperKind = X86ISD::WrapperRIP;
19616 }
19617 SDLoc DL(Op);
19618     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19619                                                 GA->getValueType(0),
19620 GA->getOffset(), OpFlag);
19621 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19622
19623 // With PIC32, the address is actually $g + Offset.
19624 if (PIC32)
19625 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19626 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19627 Offset);
19628
19629 // Lowering the machine isd will make sure everything is in the right
19630 // location.
19631 SDValue Chain = DAG.getEntryNode();
19632 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19633 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19634 SDValue Args[] = { Chain, Offset };
19635 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19636 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19637
19638 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19639 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19640 MFI.setAdjustsStack(true);
19641
19642 // And our return value (tls address) is in the standard call return value
19643 // location.
19644 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19645 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19646 }
19647
19648 if (Subtarget.isOSWindows()) {
19649 // Just use the implicit TLS architecture
19650 // Need to generate something similar to:
19651 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19652 // ; from TEB
19653 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19654 // mov rcx, qword [rdx+rcx*8]
19655 // mov eax, .tls$:tlsvar
19656 // [rax+rcx] contains the address
19657 // Windows 64bit: gs:0x58
19658 // Windows 32bit: fs:__tls_array
19659
19660 SDLoc dl(GA);
19661 SDValue Chain = DAG.getEntryNode();
19662
19663 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19664 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19665 // use its literal value of 0x2C.
19666     Value *Ptr = Constant::getNullValue(
19667         Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19668                             : PointerType::get(*DAG.getContext(), X86AS::FS));
19669
19670 SDValue TlsArray = Subtarget.is64Bit()
19671 ? DAG.getIntPtrConstant(0x58, dl)
19672 : (Subtarget.isTargetWindowsGNU()
19673 ? DAG.getIntPtrConstant(0x2C, dl)
19674 : DAG.getExternalSymbol("_tls_array", PtrVT));
19675
19676     SDValue ThreadPointer =
19677         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19678
19679 SDValue res;
19680     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19681       res = ThreadPointer;
19682 } else {
19683 // Load the _tls_index variable
19684 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19685 if (Subtarget.is64Bit())
19686 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19687 MachinePointerInfo(), MVT::i32);
19688 else
19689 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19690
19691 const DataLayout &DL = DAG.getDataLayout();
19692 SDValue Scale =
19693 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19694 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19695
19696 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19697 }
19698
19699 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19700
19701 // Get the offset of start of .tls section
19702 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19703 GA->getValueType(0),
19704                                              X86II::MO_SECREL);
19705     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19706
19707 // The address of the thread local variable is the add of the thread
19708 // pointer with the offset of the variable.
19709 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19710 }
19711
19712 llvm_unreachable("TLS not implemented for this target.");
19713}
19714
19715 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19716   if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19717 const TargetMachine &TM = getTargetMachine();
19718 TLSModel::Model Model = TM.getTLSModel(&GV);
19719 switch (Model) {
19720     case TLSModel::LocalExec:
19721     case TLSModel::InitialExec:
19722       // We can include the %fs segment register in addressing modes.
19723 return true;
19724     case TLSModel::GeneralDynamic:
19725     case TLSModel::LocalDynamic:
19726       // These models do not result in %fs relative addresses unless
19727       // TLS descriptors are used.
19728 //
19729 // Even in the case of TLS descriptors we currently have no way to model
19730 // the difference between %fs access and the computations needed for the
19731 // offset and returning `true` for TLS-desc currently duplicates both
19732 // which is detrimental :-/
19733 return false;
19734 }
19735 }
19736 return false;
19737}
19738
19739/// Lower SRA_PARTS and friends, which return two i32 values
19740/// and take a 2 x i32 value to shift plus a shift amount.
19741/// TODO: Can this be moved to general expansion code?
19742 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19743   SDValue Lo, Hi;
19744 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19745 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19746}
19747
19748// Try to use a packed vector operation to handle i64 on 32-bit targets when
19749// AVX512DQ is enabled.
19750 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19751                                         SelectionDAG &DAG,
19752 const X86Subtarget &Subtarget) {
19753 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19754 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19755 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19756 Op.getOpcode() == ISD::UINT_TO_FP) &&
19757 "Unexpected opcode!");
19758 bool IsStrict = Op->isStrictFPOpcode();
19759 unsigned OpNo = IsStrict ? 1 : 0;
19760 SDValue Src = Op.getOperand(OpNo);
19761 MVT SrcVT = Src.getSimpleValueType();
19762 MVT VT = Op.getSimpleValueType();
19763
19764 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19765 (VT != MVT::f32 && VT != MVT::f64))
19766 return SDValue();
19767
19768 // Pack the i64 into a vector, do the operation and extract.
19769
19770 // Using 256-bit to ensure result is 128-bits for f32 case.
19771 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19772 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19773 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19774
19775 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19776 if (IsStrict) {
19777 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19778 {Op.getOperand(0), InVec});
19779 SDValue Chain = CvtVec.getValue(1);
19780 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19781 DAG.getVectorIdxConstant(0, dl));
19782 return DAG.getMergeValues({Value, Chain}, dl);
19783 }
19784
19785 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19786
19787 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19788 DAG.getVectorIdxConstant(0, dl));
19789}
19790
19791// Try to use a packed vector operation to handle i64 on 32-bit targets.
19792 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19793                                  const X86Subtarget &Subtarget) {
19794 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19795 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19796 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19797 Op.getOpcode() == ISD::UINT_TO_FP) &&
19798 "Unexpected opcode!");
19799 bool IsStrict = Op->isStrictFPOpcode();
19800 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19801 MVT SrcVT = Src.getSimpleValueType();
19802 MVT VT = Op.getSimpleValueType();
19803
19804 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19805 return SDValue();
19806
19807 // Pack the i64 into a vector, do the operation and extract.
19808
19809 assert(Subtarget.hasFP16() && "Expected FP16");
19810
19811 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19812 if (IsStrict) {
19813 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19814 {Op.getOperand(0), InVec});
19815 SDValue Chain = CvtVec.getValue(1);
19816 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19817 DAG.getVectorIdxConstant(0, dl));
19818 return DAG.getMergeValues({Value, Chain}, dl);
19819 }
19820
19821 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19822
19823 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19824 DAG.getVectorIdxConstant(0, dl));
19825}
19826
19827static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19828 const X86Subtarget &Subtarget) {
19829 switch (Opcode) {
19830 case ISD::SINT_TO_FP:
19831 // TODO: Handle wider types with AVX/AVX512.
19832 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19833 return false;
19834 // CVTDQ2PS or (V)CVTDQ2PD
19835 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19836
19837 case ISD::UINT_TO_FP:
19838 // TODO: Handle wider types and i64 elements.
19839 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19840 return false;
19841 // VCVTUDQ2PS or VCVTUDQ2PD
19842 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19843
19844 default:
19845 return false;
19846 }
19847}
19848
19849/// Given a scalar cast operation that is extracted from a vector, try to
19850/// vectorize the cast op followed by extraction. This will avoid an expensive
19851/// round-trip between XMM and GPR.
19852 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19853                                       SelectionDAG &DAG,
19854 const X86Subtarget &Subtarget) {
19855 // TODO: This could be enhanced to handle smaller integer types by peeking
19856 // through an extend.
19857 SDValue Extract = Cast.getOperand(0);
19858 MVT DestVT = Cast.getSimpleValueType();
19859 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19860 !isa<ConstantSDNode>(Extract.getOperand(1)))
19861 return SDValue();
19862
19863 // See if we have a 128-bit vector cast op for this type of cast.
19864 SDValue VecOp = Extract.getOperand(0);
19865 MVT FromVT = VecOp.getSimpleValueType();
19866 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19867 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19868 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19869 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19870 return SDValue();
19871
19872 // If we are extracting from a non-zero element, first shuffle the source
19873 // vector to allow extracting from element zero.
19874 if (!isNullConstant(Extract.getOperand(1))) {
19875 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19876 Mask[0] = Extract.getConstantOperandVal(1);
19877 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19878 }
19879 // If the source vector is wider than 128 bits, extract the low part. Do not
19880 // create an unnecessarily wide vector cast op.
19881 if (FromVT != Vec128VT)
19882 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19883
19884 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19885 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19886 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19887 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19888 DAG.getVectorIdxConstant(0, DL));
19889}
19890
19891/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19892/// try to vectorize the cast ops. This will avoid an expensive round-trip
19893/// between XMM and GPR.
19894static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19895 SelectionDAG &DAG,
19896 const X86Subtarget &Subtarget) {
19897 // TODO: Allow FP_TO_UINT.
19898 SDValue CastToInt = CastToFP.getOperand(0);
19899 MVT VT = CastToFP.getSimpleValueType();
19900 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19901 return SDValue();
19902
19903 MVT IntVT = CastToInt.getSimpleValueType();
19904 SDValue X = CastToInt.getOperand(0);
19905 MVT SrcVT = X.getSimpleValueType();
19906 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19907 return SDValue();
19908
19909 // See if we have 128-bit vector cast instructions for this type of cast.
19910 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19911 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19912 IntVT != MVT::i32)
19913 return SDValue();
19914
19915 unsigned SrcSize = SrcVT.getSizeInBits();
19916 unsigned IntSize = IntVT.getSizeInBits();
19917 unsigned VTSize = VT.getSizeInBits();
19918 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19919 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19920 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19921
19922 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19923 unsigned ToIntOpcode =
19924 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19925 unsigned ToFPOpcode =
19926 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19927
19928 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19929 //
19930 // We are not defining the high elements (for example, zeroing them) because
19931 // that could nullify any performance advantage that we hoped to gain from
19932 // this vector op hack. We do not expect any adverse effects (like denorm
19933 // penalties) with cast ops.
19934 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19935 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19936 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19937 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19938 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19939}
19940
19941static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19942 SelectionDAG &DAG,
19943 const X86Subtarget &Subtarget) {
19944 bool IsStrict = Op->isStrictFPOpcode();
19945 MVT VT = Op->getSimpleValueType(0);
19946 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19947
19948 if (Subtarget.hasDQI()) {
19949 assert(!Subtarget.hasVLX() && "Unexpected features");
19950
19951 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19952 Src.getSimpleValueType() == MVT::v4i64) &&
19953 "Unsupported custom type");
19954
19955 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
19956 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19957 "Unexpected VT!");
19958 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19959
19960 // Need to concat with zero vector for strict fp to avoid spurious
19961 // exceptions.
19962 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19963 : DAG.getUNDEF(MVT::v8i64);
19964 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19965 DAG.getVectorIdxConstant(0, DL));
19966 SDValue Res, Chain;
19967 if (IsStrict) {
19968 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19969 {Op->getOperand(0), Src});
19970 Chain = Res.getValue(1);
19971 } else {
19972 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19973 }
19974
19975 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19976 DAG.getVectorIdxConstant(0, DL));
19977
19978 if (IsStrict)
19979 return DAG.getMergeValues({Res, Chain}, DL);
19980 return Res;
19981 }
19982
19983 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19984 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19985 if (VT != MVT::v4f32 || IsSigned)
19986 return SDValue();
19987
19988 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19989 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19990 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19991 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19992 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19993 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19994 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19995 SmallVector<SDValue, 4> SignCvts(4);
19996 SmallVector<SDValue, 4> Chains(4);
19997 for (int i = 0; i != 4; ++i) {
19998 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19999 DAG.getVectorIdxConstant(i, DL));
20000 if (IsStrict) {
20001 SignCvts[i] =
20002 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20003 {Op.getOperand(0), Elt});
20004 Chains[i] = SignCvts[i].getValue(1);
20005 } else {
20006 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20007 }
20008 }
20009 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20010
20011 SDValue Slow, Chain;
20012 if (IsStrict) {
20013 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20014 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20015 {Chain, SignCvt, SignCvt});
20016 Chain = Slow.getValue(1);
20017 } else {
20018 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20019 }
20020
20021 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20022 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20023
20024 if (IsStrict)
20025 return DAG.getMergeValues({Cvt, Chain}, DL);
20026
20027 return Cvt;
20028}
20029
20030static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20031 SelectionDAG &DAG) {
20032 bool IsStrict = Op->isStrictFPOpcode();
20033 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20034 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20035 MVT VT = Op.getSimpleValueType();
20036 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20037
20038 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20039 if (IsStrict)
20040 return DAG.getNode(
20041 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20042 {Chain,
20043 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20044 Rnd});
20045 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20046 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20047}
20048
20049static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20050 const X86Subtarget &Subtarget) {
20051 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20052 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20053 return true;
20054 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20055 return true;
20056 }
20057 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20058 return true;
20059 if (Subtarget.useAVX512Regs()) {
20060 if (VT == MVT::v16i32)
20061 return true;
20062 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20063 return true;
20064 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20065 return true;
20066 }
20067 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20068 (VT == MVT::v2i64 || VT == MVT::v4i64))
20069 return true;
20070 return false;
20071}
20072
20073SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20074 SelectionDAG &DAG) const {
20075 bool IsStrict = Op->isStrictFPOpcode();
20076 unsigned OpNo = IsStrict ? 1 : 0;
20077 SDValue Src = Op.getOperand(OpNo);
20078 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20079 MVT SrcVT = Src.getSimpleValueType();
20080 MVT VT = Op.getSimpleValueType();
20081 SDLoc dl(Op);
20082
20083 if (isSoftF16(VT, Subtarget))
20084 return promoteXINT_TO_FP(Op, dl, DAG);
20085 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20086 return Op;
20087
20088 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20089 return LowerWin64_INT128_TO_FP(Op, DAG);
20090
20091 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20092 return Extract;
20093
20094 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20095 return R;
20096
20097 if (SrcVT.isVector()) {
20098 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20099 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20100 // source for strict FP.
20101 if (IsStrict)
20102 return DAG.getNode(
20103 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20104 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20105 DAG.getUNDEF(SrcVT))});
20106 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20107 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20108 DAG.getUNDEF(SrcVT)));
20109 }
20110 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20111 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20112
20113 return SDValue();
20114 }
20115
20116 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20117 "Unknown SINT_TO_FP to lower!");
20118
20119 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20120
20121 // These are really Legal; return the operand so the caller accepts it as
20122 // Legal.
20123 if (SrcVT == MVT::i32 && UseSSEReg)
20124 return Op;
20125 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20126 return Op;
20127
20128 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20129 return V;
20130 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20131 return V;
20132
20133 // SSE doesn't have an i16 conversion so we need to promote.
20134 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20135 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20136 if (IsStrict)
20137 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20138 {Chain, Ext});
20139
20140 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20141 }
20142
20143 if (VT == MVT::f128 || !Subtarget.hasX87())
20144 return SDValue();
20145
20146 SDValue ValueToStore = Src;
20147 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20148 // Bitcasting to f64 here allows us to do a single 64-bit store from
20149 // an SSE register, avoiding the store forwarding penalty that would come
20150 // with two 32-bit stores.
20151 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20152
20153 unsigned Size = SrcVT.getStoreSize();
20154 Align Alignment(Size);
20155 MachineFunction &MF = DAG.getMachineFunction();
20156 auto PtrVT = getPointerTy(MF.getDataLayout());
20157 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20158 MachinePointerInfo MPI =
20159 MachinePointerInfo::getFixedStack(MF, SSFI);
20160 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20161 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20162 std::pair<SDValue, SDValue> Tmp =
20163 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20164
20165 if (IsStrict)
20166 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20167
20168 return Tmp.first;
20169}
20170
20171std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20172 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20173 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20174 // Build the FILD
20175 SDVTList Tys;
20176 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20177 if (useSSE)
20178 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20179 else
20180 Tys = DAG.getVTList(DstVT, MVT::Other);
20181
20182 SDValue FILDOps[] = {Chain, Pointer};
20183 SDValue Result =
20184 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20185 Alignment, MachineMemOperand::MOLoad);
20186 Chain = Result.getValue(1);
20187
20188 if (useSSE) {
20189 MachineFunction &MF = DAG.getMachineFunction();
20190 unsigned SSFISize = DstVT.getStoreSize();
20191 int SSFI =
20192 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20193 auto PtrVT = getPointerTy(MF.getDataLayout());
20194 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20195 Tys = DAG.getVTList(MVT::Other);
20196 SDValue FSTOps[] = {Chain, Result, StackSlot};
20197 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20198 MachinePointerInfo::getFixedStack(MF, SSFI),
20199 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20200
20201 Chain =
20202 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20203 Result = DAG.getLoad(
20204 DstVT, DL, Chain, StackSlot,
20205 MachinePointerInfo::getFixedStack(MF, SSFI));
20206 Chain = Result.getValue(1);
20207 }
20208
20209 return { Result, Chain };
20210}
20211
20212/// Horizontal vector math instructions may be slower than normal math with
20213/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20214/// implementation, and likely shuffle complexity of the alternate sequence.
20215static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20216 const X86Subtarget &Subtarget) {
20217 bool IsOptimizingSize = DAG.shouldOptForSize();
20218 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20219 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20220}
20221
20222/// 64-bit unsigned integer to double expansion.
20223static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20224 SelectionDAG &DAG,
20225 const X86Subtarget &Subtarget) {
20226 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20227 // when converting 0 while rounding toward negative infinity. The caller will
20228 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20229 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20230 // This algorithm is not obvious. Here is what we're trying to output:
20231 /*
20232 movq %rax, %xmm0
20233 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20234 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20235 #ifdef __SSE3__
20236 haddpd %xmm0, %xmm0
20237 #else
20238 pshufd $0x4e, %xmm0, %xmm1
20239 addpd %xmm1, %xmm0
20240 #endif
20241 */
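// Editorial note (illustrative, not in the upstream source): why the constants
// work. Write the u64 input as X = Hi*2^32 + Lo (high/low 32-bit halves).
// After the punpckldq with c0 the register holds two double bit patterns:
//   0x43300000:Lo -> 2^52 + Lo, and 0x45300000:Hi -> 2^84 + Hi*2^32.
// Subtracting c1 = { 2^52, 2^84 } leaves { Lo, Hi*2^32 }, and the horizontal
// add yields Lo + Hi*2^32, i.e. X converted to double (up to rounding).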
20242
20243 LLVMContext *Context = DAG.getContext();
20244
20245 // Build some magic constants.
20246 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20247 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20248 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20249 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20250
20251 SmallVector<Constant*,2> CV1;
20252 CV1.push_back(
20253 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20254 APInt(64, 0x4330000000000000ULL))));
20255 CV1.push_back(
20256 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20257 APInt(64, 0x4530000000000000ULL))));
20258 Constant *C1 = ConstantVector::get(CV1);
20259 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20260
20261 // Load the 64-bit value into an XMM register.
20262 SDValue XR1 =
20263 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20264 SDValue CLod0 = DAG.getLoad(
20265 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20266 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20267 SDValue Unpck1 =
20268 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20269
20270 SDValue CLod1 = DAG.getLoad(
20271 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20272 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20273 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20274 // TODO: Are there any fast-math-flags to propagate here?
20275 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20276 SDValue Result;
20277
20278 if (Subtarget.hasSSE3() &&
20279 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20280 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20281 } else {
20282 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20283 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20284 }
20285 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20286 DAG.getVectorIdxConstant(0, dl));
20287 return Result;
20288}
20289
20290/// 32-bit unsigned integer to float expansion.
20291static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20292 SelectionDAG &DAG,
20293 const X86Subtarget &Subtarget) {
20294 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20295 // FP constant to bias correct the final result.
20296 SDValue Bias = DAG.getConstantFP(
20297 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
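// Editorial note (not in the upstream source): 0x4330000000000000 is the bit
// pattern of 2^52 as a double. OR'ing the zero-extended u32 value X into the
// low 32 bits produces the double 2^52 + X exactly (X fits in the 52-bit
// mantissa), so the bias subtraction below recovers X converted to double.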
20298
20299 // Load the 32-bit value into an XMM register.
20300 SDValue Load =
20301 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20302
20303 // Zero out the upper parts of the register.
20304 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20305
20306 // Or the load with the bias.
20307 SDValue Or = DAG.getNode(
20308 ISD::OR, dl, MVT::v2i64,
20309 DAG.getBitcast(MVT::v2i64, Load),
20310 DAG.getBitcast(MVT::v2i64,
20311 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20312 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20313 DAG.getBitcast(MVT::v2f64, Or),
20314 DAG.getVectorIdxConstant(0, dl));
20315
20316 if (Op.getNode()->isStrictFPOpcode()) {
20317 // Subtract the bias.
20318 // TODO: Are there any fast-math-flags to propagate here?
20319 SDValue Chain = Op.getOperand(0);
20320 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20321 {Chain, Or, Bias});
20322
20323 if (Op.getValueType() == Sub.getValueType())
20324 return Sub;
20325
20326 // Handle final rounding.
20327 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20328 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20329
20330 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20331 }
20332
20333 // Subtract the bias.
20334 // TODO: Are there any fast-math-flags to propagate here?
20335 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20336
20337 // Handle final rounding.
20338 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20339}
20340
20341static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20342 SelectionDAG &DAG,
20343 const X86Subtarget &Subtarget) {
20344 if (Op.getSimpleValueType() != MVT::v2f64)
20345 return SDValue();
20346
20347 bool IsStrict = Op->isStrictFPOpcode();
20348
20349 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20350 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20351
20352 if (Subtarget.hasAVX512()) {
20353 if (!Subtarget.hasVLX()) {
20354 // Let generic type legalization widen this.
20355 if (!IsStrict)
20356 return SDValue();
20357 // Otherwise pad the integer input with 0s and widen the operation.
20358 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20359 DAG.getConstant(0, DL, MVT::v2i32));
20360 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20361 {Op.getOperand(0), N0});
20362 SDValue Chain = Res.getValue(1);
20363 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20364 DAG.getVectorIdxConstant(0, DL));
20365 return DAG.getMergeValues({Res, Chain}, DL);
20366 }
20367
20368 // Legalize to v4i32 type.
20369 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20370 DAG.getUNDEF(MVT::v2i32));
20371 if (IsStrict)
20372 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20373 {Op.getOperand(0), N0});
20374 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20375 }
20376
20377 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20378 // This gives us the floating point equivalent of 2^52 + the i32 integer
20379 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20380 // point leaving just our i32 integers in double format.
20381 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20382 SDValue VBias = DAG.getConstantFP(
20383 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20384 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20385 DAG.getBitcast(MVT::v2i64, VBias));
20386 Or = DAG.getBitcast(MVT::v2f64, Or);
20387
20388 if (IsStrict)
20389 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20390 {Op.getOperand(0), Or, VBias});
20391 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20392}
20393
20394static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20395 SelectionDAG &DAG,
20396 const X86Subtarget &Subtarget) {
20397 bool IsStrict = Op->isStrictFPOpcode();
20398 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20399 MVT VecIntVT = V.getSimpleValueType();
20400 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20401 "Unsupported custom type");
20402
20403 if (Subtarget.hasAVX512()) {
20404 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20405 assert(!Subtarget.hasVLX() && "Unexpected features");
20406 MVT VT = Op->getSimpleValueType(0);
20407
20408 // v8i32->v8f64 is legal with AVX512 so just return it.
20409 if (VT == MVT::v8f64)
20410 return Op;
20411
20412 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20413 VT == MVT::v8f16) &&
20414 "Unexpected VT!");
20415 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20416 MVT WideIntVT = MVT::v16i32;
20417 if (VT == MVT::v4f64) {
20418 WideVT = MVT::v8f64;
20419 WideIntVT = MVT::v8i32;
20420 }
20421
20422 // Need to concat with zero vector for strict fp to avoid spurious
20423 // exceptions.
20424 SDValue Tmp =
20425 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20426 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20427 DAG.getVectorIdxConstant(0, DL));
20428 SDValue Res, Chain;
20429 if (IsStrict) {
20430 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20431 {Op->getOperand(0), V});
20432 Chain = Res.getValue(1);
20433 } else {
20434 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20435 }
20436
20437 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20438 DAG.getVectorIdxConstant(0, DL));
20439
20440 if (IsStrict)
20441 return DAG.getMergeValues({Res, Chain}, DL);
20442 return Res;
20443 }
20444
20445 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20446 Op->getSimpleValueType(0) == MVT::v4f64) {
20447 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20448 Constant *Bias = ConstantFP::get(
20449 *DAG.getContext(),
20450 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20451 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20452 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20453 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20454 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20455 SDValue VBias = DAG.getMemIntrinsicNode(
20456 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20457 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20458 MachineMemOperand::MOLoad);
20459
20460 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20461 DAG.getBitcast(MVT::v4i64, VBias));
20462 Or = DAG.getBitcast(MVT::v4f64, Or);
20463
20464 if (IsStrict)
20465 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20466 {Op.getOperand(0), Or, VBias});
20467 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20468 }
20469
20470 // The algorithm is the following:
20471 // #ifdef __SSE4_1__
20472 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20473 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20474 // (uint4) 0x53000000, 0xaa);
20475 // #else
20476 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20477 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20478 // #endif
20479 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20480 // return (float4) lo + fhi;
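// Editorial example (not in the upstream source): for v = 0x00012345,
//   lo has bits 0x4b002345 -> 2^23 + 0x2345
//   hi has bits 0x53000001 -> 2^39 + 0x1 * 2^16
//   fhi = hi - (2^39 + 2^23) = 0x10000 - 2^23
//   lo + fhi = (2^23 + 0x2345) + (0x10000 - 2^23) = 0x12345 = (float)v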
20481
20482 bool Is128 = VecIntVT == MVT::v4i32;
20483 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20484 // If we convert to something other than the supported type, e.g., to v4f64,
20485 // abort early.
20486 if (VecFloatVT != Op->getSimpleValueType(0))
20487 return SDValue();
20488
20489 // In the #ifdef/#else code, we have in common:
20490 // - The vector of constants:
20491 // -- 0x4b000000
20492 // -- 0x53000000
20493 // - A shift:
20494 // -- v >> 16
20495
20496 // Create the splat vector for 0x4b000000.
20497 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20498 // Create the splat vector for 0x53000000.
20499 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20500
20501 // Create the right shift.
20502 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20503 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20504
20505 SDValue Low, High;
20506 if (Subtarget.hasSSE41()) {
20507 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20508 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20509 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20510 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20511 // Low will be bitcasted right away, so do not bother bitcasting back to its
20512 // original type.
20513 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20514 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20515 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20516 // (uint4) 0x53000000, 0xaa);
20517 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20518 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20519 // High will be bitcasted right away, so do not bother bitcasting back to
20520 // its original type.
20521 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20522 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20523 } else {
20524 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20525 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20526 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20527 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20528
20529 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20530 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20531 }
20532
20533 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20534 SDValue VecCstFSub = DAG.getConstantFP(
20535 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20536
20537 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20538 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20539 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20540 // enabled. See PR24512.
20541 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20542 // TODO: Are there any fast-math-flags to propagate here?
20543 // (float4) lo;
20544 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20545 // return (float4) lo + fhi;
20546 if (IsStrict) {
20547 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20548 {Op.getOperand(0), HighBitcast, VecCstFSub});
20549 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20550 {FHigh.getValue(1), LowBitcast, FHigh});
20551 }
20552
20553 SDValue FHigh =
20554 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20555 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20556}
20557
20558static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20559 const X86Subtarget &Subtarget) {
20560 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20561 SDValue N0 = Op.getOperand(OpNo);
20562 MVT SrcVT = N0.getSimpleValueType();
20563
20564 switch (SrcVT.SimpleTy) {
20565 default:
20566 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20567 case MVT::v2i32:
20568 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20569 case MVT::v4i32:
20570 case MVT::v8i32:
20571 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20572 case MVT::v2i64:
20573 case MVT::v4i64:
20574 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20575 }
20576}
20577
20578SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20579 SelectionDAG &DAG) const {
20580 bool IsStrict = Op->isStrictFPOpcode();
20581 unsigned OpNo = IsStrict ? 1 : 0;
20582 SDValue Src = Op.getOperand(OpNo);
20583 SDLoc dl(Op);
20584 auto PtrVT = getPointerTy(DAG.getDataLayout());
20585 MVT SrcVT = Src.getSimpleValueType();
20586 MVT DstVT = Op->getSimpleValueType(0);
20587 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20588
20589 // Bail out when we don't have native conversion instructions.
20590 if (DstVT == MVT::f128)
20591 return SDValue();
20592
20593 if (isSoftF16(DstVT, Subtarget))
20594 return promoteXINT_TO_FP(Op, dl, DAG);
20595 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20596 return Op;
20597
20598 if (DstVT.isVector())
20599 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20600
20601 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20602 return LowerWin64_INT128_TO_FP(Op, DAG);
20603
20604 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20605 return Extract;
20606
20607 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20608 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20609 // Conversions from unsigned i32 to f32/f64 are legal,
20610 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20611 return Op;
20612 }
20613
20614 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20615 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20616 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20617 if (IsStrict)
20618 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20619 {Chain, Src});
20620 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20621 }
20622
20623 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20624 return V;
20625 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20626 return V;
20627
20628 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20629 // infinity. It produces -0.0, so disable it under strictfp.
20630 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20631 !IsStrict)
20632 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20633 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20634 // negative infinity, so disable it under strictfp and use FILD instead.
20635 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20636 !IsStrict)
20637 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20638 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20639 (DstVT == MVT::f32 || DstVT == MVT::f64))
20640 return SDValue();
20641
20642 // Make a 64-bit buffer, and use it to build an FILD.
20643 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20644 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20645 Align SlotAlign(8);
20646 MachinePointerInfo MPI =
20647 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20648 if (SrcVT == MVT::i32) {
20649 SDValue OffsetSlot =
20650 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20651 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20652 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20653 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20654 std::pair<SDValue, SDValue> Tmp =
20655 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20656 if (IsStrict)
20657 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20658
20659 return Tmp.first;
20660 }
20661
20662 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20663 SDValue ValueToStore = Src;
20664 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20665 // Bitcasting to f64 here allows us to do a single 64-bit store from
20666 // an SSE register, avoiding the store forwarding penalty that would come
20667 // with two 32-bit stores.
20668 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20669 }
20670 SDValue Store =
20671 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20672 // For i64 source, we need to add the appropriate power of 2 if the input
20673 // was negative. We must be careful to do the computation in x87 extended
20674 // precision, not in SSE.
20675 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20676 SDValue Ops[] = {Store, StackSlot};
20677 SDValue Fild =
20678 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20679 SlotAlign, MachineMemOperand::MOLoad);
20680 Chain = Fild.getValue(1);
20681
20682 // Check whether the sign bit is set.
20683 SDValue SignSet = DAG.getSetCC(
20684 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20685 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20686
20687 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20688 APInt FF(64, 0x5F80000000000000ULL);
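// Editorial note (not in the upstream source): 0x5F800000 is the f32 bit
// pattern of 2^64. Little-endian layout puts the zero word at offset 0 and
// 0x5F800000 at offset 4, so the offset select below loads 2^64 when the
// sign bit was set (FILD interpreted the u64 as negative) and 0.0 otherwise.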
20689 SDValue FudgePtr =
20690 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20691 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20692
20693 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20694 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20695 SDValue Four = DAG.getIntPtrConstant(4, dl);
20696 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20697 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20698
20699 // Load the value out, extending it from f32 to f80.
20700 SDValue Fudge = DAG.getExtLoad(
20701 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20702 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20703 CPAlignment);
20704 Chain = Fudge.getValue(1);
20705 // Extend everything to 80 bits to force it to be done on x87.
20706 // TODO: Are there any fast-math-flags to propagate here?
20707 if (IsStrict) {
20708 unsigned Opc = ISD::STRICT_FADD;
20709 // Windows needs the precision control changed to 80 bits around this add.
20710 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20711 Opc = X86ISD::STRICT_FP80_ADD;
20712
20713 SDValue Add =
20714 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20715 // STRICT_FP_ROUND can't handle equal types.
20716 if (DstVT == MVT::f80)
20717 return Add;
20718 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20719 {Add.getValue(1), Add,
20720 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20721 }
20722 unsigned Opc = ISD::FADD;
20723 // Windows needs the precision control changed to 80 bits around this add.
20724 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20725 Opc = X86ISD::FP80_ADD;
20726
20727 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20728 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20729 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20730}
20731
20732// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20733// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20734// just return an SDValue().
20735// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20736// to i16, i32 or i64, and we lower it to a legal sequence and return the
20737// result.
20738SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20739 bool IsSigned,
20740 SDValue &Chain) const {
20741 bool IsStrict = Op->isStrictFPOpcode();
20742 SDLoc DL(Op);
20743
20744 EVT DstTy = Op.getValueType();
20745 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20746 EVT TheVT = Value.getValueType();
20747 auto PtrVT = getPointerTy(DAG.getDataLayout());
20748
20749 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20750 // f16 must be promoted before using the lowering in this routine.
20751 // fp128 does not use this lowering.
20752 return SDValue();
20753 }
20754
20755 // If using FIST to compute an unsigned i64, we'll need some fixup
20756 // to handle values above the maximum signed i64. A FIST is always
20757 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20758 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20759
20760 // FIXME: This does not generate an invalid exception if the input does not
20761 // fit in i32. PR44019
20762 if (!IsSigned && DstTy != MVT::i64) {
20763 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20764 // The low 32 bits of the fist result will have the correct uint32 result.
20765 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20766 DstTy = MVT::i64;
20767 }
20768
20769 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20770 DstTy.getSimpleVT() >= MVT::i16 &&
20771 "Unknown FP_TO_INT to lower!");
20772
20773 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20774 // stack slot.
20775 MachineFunction &MF = DAG.getMachineFunction();
20776 unsigned MemSize = DstTy.getStoreSize();
20777 int SSFI =
20778 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20779 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20780
20781 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20782
20783 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20784
20785 if (UnsignedFixup) {
20786 //
20787 // Conversion to unsigned i64 is implemented with a select,
20788 // depending on whether the source value fits in the range
20789 // of a signed i64. Let Thresh be the FP equivalent of
20790 // 0x8000000000000000ULL.
20791 //
20792 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20793 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20794 // FistSrc = (Value - FltOfs);
20795 // Fist-to-mem64 FistSrc
20796 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20797 // to XOR'ing the high 32 bits with Adjust.
20798 //
20799 // Being a power of 2, Thresh is exactly representable in all FP formats.
20800 // For X87 we'd like to use the smallest FP type for this constant, but
20801 // for DAG type consistency we have to match the FP operand type.
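// Editorial example (not in the upstream source): for Value = 2^63 + 2048.0,
// Value >= Thresh, so FltOfs = 2^63 and Adjust = 1 << 63. The FIST converts
// Value - 2^63 = 2048.0 to the signed i64 2048, and XOR'ing with Adjust
// restores the unsigned result 0x8000000000000800.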
20802
20803 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20804 APFloat::opStatus Status = APFloat::opOK;
20805 bool LosesInfo = false;
20806 if (TheVT == MVT::f64)
20807 // The rounding mode is irrelevant as the conversion should be exact.
20808 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20809 &LosesInfo);
20810 else if (TheVT == MVT::f80)
20811 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20812 APFloat::rmNearestTiesToEven, &LosesInfo);
20813
20814 assert(Status == APFloat::opOK && !LosesInfo &&
20815 "FP conversion should have been exact");
20816
20817 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20818
20819 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20820 *DAG.getContext(), TheVT);
20821 SDValue Cmp;
20822 if (IsStrict) {
20823 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20824 /*IsSignaling*/ true);
20825 Chain = Cmp.getValue(1);
20826 } else {
20827 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20828 }
20829
20830 // Our preferred lowering of
20831 //
20832 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20833 //
20834 // is
20835 //
20836 // (Value >= Thresh) << 63
20837 //
20838 // but since we can get here after LegalOperations, DAGCombine might do the
20839 // wrong thing if we create a select. So, directly create the preferred
20840 // version.
20841 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20842 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20843 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20844
20845 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20846 DAG.getConstantFP(0.0, DL, TheVT));
20847
20848 if (IsStrict) {
20849 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20850 { Chain, Value, FltOfs });
20851 Chain = Value.getValue(1);
20852 } else
20853 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20854 }
20855
20856 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20857
20858 // FIXME This causes a redundant load/store if the SSE-class value is already
20859 // in memory, such as if it is on the callstack.
20860 if (isScalarFPTypeInSSEReg(TheVT)) {
20861 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20862 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20863 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20864 SDValue Ops[] = { Chain, StackSlot };
20865
20866 unsigned FLDSize = TheVT.getStoreSize();
20867 assert(FLDSize <= MemSize && "Stack slot not big enough");
20868 MachineMemOperand *MMO = MF.getMachineMemOperand(
20869 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20870 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20871 Chain = Value.getValue(1);
20872 }
20873
20874 // Build the FP_TO_INT*_IN_MEM
20875 MachineMemOperand *MMO = MF.getMachineMemOperand(
20876 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20877 SDValue Ops[] = { Chain, Value, StackSlot };
20878 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20879 DAG.getVTList(MVT::Other),
20880 Ops, DstTy, MMO);
20881
20882 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20883 Chain = Res.getValue(1);
20884
20885 // If we need an unsigned fixup, XOR the result with adjust.
20886 if (UnsignedFixup)
20887 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20888
20889 return Res;
20890}
20891
20892static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20893 const X86Subtarget &Subtarget) {
20894 MVT VT = Op.getSimpleValueType();
20895 SDValue In = Op.getOperand(0);
20896 MVT InVT = In.getSimpleValueType();
20897 unsigned Opc = Op.getOpcode();
20898
20899 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20901 "Unexpected extension opcode");
20903 "Expected same number of elements");
20904 assert((VT.getVectorElementType() == MVT::i16 ||
20905 VT.getVectorElementType() == MVT::i32 ||
20906 VT.getVectorElementType() == MVT::i64) &&
20907 "Unexpected element type");
20908 assert((InVT.getVectorElementType() == MVT::i8 ||
20909 InVT.getVectorElementType() == MVT::i16 ||
20910 InVT.getVectorElementType() == MVT::i32) &&
20911 "Unexpected element type");
20912
20913 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20914
20915 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20916 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20917 return splitVectorIntUnary(Op, DAG, dl);
20918 }
20919
20920 if (Subtarget.hasInt256())
20921 return Op;
20922
20923 // Optimize vectors in AVX mode:
20924 //
20925 // v8i16 -> v8i32
20926 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
20927 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20928 // Concat upper and lower parts.
20929 //
20930 // v4i32 -> v4i64
20931 // Use vpmovzxdq for 4 lower elements v4i32 -> v2i64.
20932 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20933 // Concat upper and lower parts.
20934 //
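// Editorial example (not in the upstream source), for a zext of
// In = <a0..a7 x i16> to v8i32:
//   OpLo = zero_extend_vector_inreg(In)           -> <a0 a1 a2 a3 x i32>
//   OpHi = bitcast(punpckhwd(In, zero)) to v4i32  -> <a4 a5 a6 a7 x i32>
//   result = concat_vectors(OpLo, OpHi)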
20935 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20936 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20937
20938 // Short-circuit if we can determine that each 128-bit half is the same value.
20939 // Otherwise, this is difficult to match and optimize.
20940 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20941 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20942 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20943
20944 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20945 SDValue Undef = DAG.getUNDEF(InVT);
20946 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20947 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20948 OpHi = DAG.getBitcast(HalfVT, OpHi);
20949
20950 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20951}
20952
20953// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20954static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20955 const SDLoc &dl, SelectionDAG &DAG) {
20956 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20957 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20958 DAG.getVectorIdxConstant(0, dl));
20959 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20960 DAG.getVectorIdxConstant(8, dl));
20961 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20962 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20963 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20964 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20965}
20966
20967static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20968 const X86Subtarget &Subtarget,
20969 SelectionDAG &DAG) {
20970 MVT VT = Op->getSimpleValueType(0);
20971 SDValue In = Op->getOperand(0);
20972 MVT InVT = In.getSimpleValueType();
20973 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20974 unsigned NumElts = VT.getVectorNumElements();
20975
20976 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20977 // avoids a constant pool load.
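// Editorial example (not in the upstream source): for i32 elements, the sext
// yields 0 or 0xFFFFFFFF per lane, and the logical shift right by 31 leaves
// the desired 0 or 1 without loading a constant mask.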
20978 if (VT.getVectorElementType() != MVT::i8) {
20979 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20980 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20981 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20982 }
20983
20984 // Extend VT if BWI is not supported.
20985 MVT ExtVT = VT;
20986 if (!Subtarget.hasBWI()) {
20987 // If v16i32 is to be avoided, we'll need to split and concatenate.
20988 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20989 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20990
20991 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20992 }
20993
20994 // Widen to 512-bits if VLX is not supported.
20995 MVT WideVT = ExtVT;
20996 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20997 NumElts *= 512 / ExtVT.getSizeInBits();
20998 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20999 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21000 DAG.getVectorIdxConstant(0, DL));
21001 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21002 }
21003
21004 SDValue One = DAG.getConstant(1, DL, WideVT);
21005 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21006
21007 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21008
21009 // Truncate if we had to extend above.
21010 if (VT != ExtVT) {
21011 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21012 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21013 }
21014
21015 // Extract back to 128/256-bit if we widened.
21016 if (WideVT != VT)
21017 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21018 DAG.getVectorIdxConstant(0, DL));
21019
21020 return SelectedVal;
21021}
21022
21023static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21024 SelectionDAG &DAG) {
21025 SDValue In = Op.getOperand(0);
21026 MVT SVT = In.getSimpleValueType();
21027 SDLoc DL(Op);
21028
21029 if (SVT.getVectorElementType() == MVT::i1)
21030 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21031
21032 assert(Subtarget.hasAVX() && "Expected AVX support");
21033 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21034}
21035
21036/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21037/// It makes use of the fact that vectors with enough leading sign/zero bits
21038/// prevent the PACKSS/PACKUS from saturating the results.
21039/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21040/// within each 128-bit lane.
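/// For example (editorial note, not in the upstream comment): a v8i32 source
/// with at least 17 sign bits per element can be split into two v4i32 halves
/// and narrowed to v8i16 with a single PACKSSDW, since no element saturates.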
21041static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21042 const SDLoc &DL, SelectionDAG &DAG,
21043 const X86Subtarget &Subtarget) {
21044 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21045 "Unexpected PACK opcode");
21046 assert(DstVT.isVector() && "VT not a vector?");
21047
21048 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21049 if (!Subtarget.hasSSE2())
21050 return SDValue();
21051
21052 EVT SrcVT = In.getValueType();
21053
21054 // No truncation required, we might get here due to recursive calls.
21055 if (SrcVT == DstVT)
21056 return In;
21057
21058 unsigned NumElems = SrcVT.getVectorNumElements();
21059 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21060 return SDValue();
21061
21062 unsigned DstSizeInBits = DstVT.getSizeInBits();
21063 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21064 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21065 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21066
21067 LLVMContext &Ctx = *DAG.getContext();
21068 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21069 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21070
21071 // Pack to the largest type possible:
21072 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21073 EVT InVT = MVT::i16, OutVT = MVT::i8;
21074 if (SrcVT.getScalarSizeInBits() > 16 &&
21075 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21076 InVT = MVT::i32;
21077 OutVT = MVT::i16;
21078 }
21079
21080 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21081 // On pre-AVX512, pack the src in both halves to help value tracking.
21082 if (SrcSizeInBits <= 128) {
21083 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21084 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21085 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21086 SDValue LHS = DAG.getBitcast(InVT, In);
21087 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21088 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21089 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21090 Res = DAG.getBitcast(PackedVT, Res);
21091 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21092 }
21093
21094 // Split lower/upper subvectors.
21095 SDValue Lo, Hi;
21096 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21097
21098 // If Hi is undef, then don't bother packing it and widen the result instead.
21099 if (Hi.isUndef()) {
21100 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21101 if (SDValue Res =
21102 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21103 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21104 }
21105
21106 unsigned SubSizeInBits = SrcSizeInBits / 2;
21107 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21108 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21109
21110 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21111 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21112 Lo = DAG.getBitcast(InVT, Lo);
21113 Hi = DAG.getBitcast(InVT, Hi);
21114 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21115 return DAG.getBitcast(DstVT, Res);
21116 }
21117
21118 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21119 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21120 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21121 Lo = DAG.getBitcast(InVT, Lo);
21122 Hi = DAG.getBitcast(InVT, Hi);
21123 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21124
21125 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21126 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21127 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
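// Editorial example (not in the upstream source): with OutVT == v16i16,
// Scale is 4 and { 0, 2, 1, 3 } becomes
// { 0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15 } in i16 elements.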
21128 SmallVector<int, 64> Mask;
21129 int Scale = 64 / OutVT.getScalarSizeInBits();
21130 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21131 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21132
21133 if (DstVT.is256BitVector())
21134 return DAG.getBitcast(DstVT, Res);
21135
21136 // If 512bit -> 128bit truncate another stage.
21137 Res = DAG.getBitcast(PackedVT, Res);
21138 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21139 }
21140
21141 // Recursively pack lower/upper subvectors, concat result and pack again.
21142 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21143
21144 if (PackedVT.is128BitVector()) {
21145 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21146 // type legalization.
21147 SDValue Res =
21148 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21149 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21150 }
21151
21152 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21153 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21154 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21155 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21156 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21157}
21158
21159/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21160/// e.g. trunc <8 x i32> X to <8 x i16> -->
21161/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21162/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21163static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21164 const X86Subtarget &Subtarget,
21165 SelectionDAG &DAG) {
21166 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21167 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21168}
21169
21170/// Truncate using inreg sign extension and X86ISD::PACKSS.
21171static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21172 const X86Subtarget &Subtarget,
21173 SelectionDAG &DAG) {
21174 EVT SrcVT = In.getValueType();
21175 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21176 DAG.getValueType(DstVT));
21177 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21178}
21179
21180/// Helper to determine if \p In truncated to \p DstVT has the necessary
21181/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21182/// possibly by converting a SRL node to SRA for sign extension.
21183static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21184 SDValue In, const SDLoc &DL,
21185 SelectionDAG &DAG,
21186 const X86Subtarget &Subtarget,
21187 const SDNodeFlags Flags = SDNodeFlags()) {
21188 // Requires SSE2.
21189 if (!Subtarget.hasSSE2())
21190 return SDValue();
21191
21192 EVT SrcVT = In.getValueType();
21193 EVT DstSVT = DstVT.getVectorElementType();
21194 EVT SrcSVT = SrcVT.getVectorElementType();
21195 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21196 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21197
21198 // Check we have a truncation suited for PACKSS/PACKUS.
21199 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21200 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21201 return SDValue();
21202
21203 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21204 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21205
21206 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21207 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21208 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21209 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21210 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21211 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21212 return SDValue();
21213
21214 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21215 // split this for packing.
21216 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21217 !isFreeToSplitVector(In, DAG) &&
21218 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21219 return SDValue();
21220
21221 // Don't truncate on AVX512 targets using multiple PACK node stages.
21222 if (Subtarget.hasAVX512() && NumStages > 1)
21223 return SDValue();
21224
21225 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21226 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21227
21228 // Truncate with PACKUS if we are truncating a vector with leading zero
21229 // bits that extend all the way to the packed/truncated value.
21230 // e.g. Masks, zext_in_reg, etc.
21231 // Pre-SSE41 we can only use PACKUSWB.
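// Editorial example (not in the upstream source): a v8i32 value produced by a
// zext from v8i16 (or masked with 0xFFFF) has its top 16 bits known zero, so
// PACKUSDW (SSE4.1) can narrow it to v8i16 without saturating.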
21232 KnownBits Known = DAG.computeKnownBits(In);
21233 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21234 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21235 PackOpcode = X86ISD::PACKUS;
21236 return In;
21237 }
21238
21239 // Truncate with PACKSS if we are truncating a vector with sign-bits
21240 // that extend all the way to the packed/truncated value.
21241 // e.g. Comparison result, sext_in_reg, etc.
21242 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21243
21244 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21245 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21246 // see through BITCASTs later on and combines/simplifications can't then use
21247 // it.
21248 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21249 !Subtarget.hasAVX512())
21250 return SDValue();
21251
21252 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21253 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21254 MinSignBits < NumSignBits) {
21255 PackOpcode = X86ISD::PACKSS;
21256 return In;
21257 }
21258
21259 // If we have a srl that only generates signbits that we will discard in
21260 // the truncation then we can use PACKSS by converting the srl to a sra.
21261 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21262 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21263 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21264 if (*ShAmt == MinSignBits) {
21265 PackOpcode = X86ISD::PACKSS;
21266 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21267 }
21268 }
21269
21270 return SDValue();
21271}
21272
21273/// This function lowers a vector truncation of 'extended sign-bits' or
21274 /// 'extended zero-bits' values, from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
21275 /// using X86ISD::PACKSS/PACKUS operations.
21276 static SDValue LowerTruncateVecPackWithSignBits(
21277 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21278 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21279 MVT SrcVT = In.getSimpleValueType();
21280 MVT DstSVT = DstVT.getVectorElementType();
21281 MVT SrcSVT = SrcVT.getVectorElementType();
21282 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21283 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21284 return SDValue();
21285
21286 // If the upper half of the source is undef, then attempt to split and
21287 // only truncate the lower half.
21288 if (DstVT.getSizeInBits() >= 128) {
21289 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21290 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21291 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21292 Subtarget, DAG))
21293 return widenSubVector(Res, false, Subtarget, DAG, DL,
21294 DstVT.getSizeInBits());
21295 }
21296 }
21297
21298 unsigned PackOpcode;
21299 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21300 Subtarget, Flags))
21301 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21302
21303 return SDValue();
21304}
21305
21306/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21307/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21308 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21309 const X86Subtarget &Subtarget,
21310 SelectionDAG &DAG) {
21311 MVT SrcVT = In.getSimpleValueType();
21312 MVT DstSVT = DstVT.getVectorElementType();
21313 MVT SrcSVT = SrcVT.getVectorElementType();
21314 unsigned NumElems = DstVT.getVectorNumElements();
21315 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21316 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21317 NumElems >= 8))
21318 return SDValue();
21319
21320 // SSSE3's pshufb results in fewer instructions in the cases below.
21321 if (Subtarget.hasSSSE3() && NumElems == 8) {
21322 if (SrcSVT == MVT::i16)
21323 return SDValue();
21324 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21325 return SDValue();
21326 }
21327
21328 // If the upper half of the source is undef, then attempt to split and
21329 // only truncate the lower half.
21330 if (DstVT.getSizeInBits() >= 128) {
21331 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21332 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21333 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21334 return widenSubVector(Res, false, Subtarget, DAG, DL,
21335 DstVT.getSizeInBits());
21336 }
21337 }
21338
21339 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21340 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21341 // truncate 2 x v4i32 to v8i16.
21342 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21343 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21344
21345 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21346 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21347
21348 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21349 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21350 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21351 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21352 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21353 }
21354
21355 return SDValue();
21356}
21357
21358 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21359 SelectionDAG &DAG,
21360 const X86Subtarget &Subtarget) {
21361 MVT VT = Op.getSimpleValueType();
21362 SDValue In = Op.getOperand(0);
21363 MVT InVT = In.getSimpleValueType();
21364 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21365
21366 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
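// e.g. v16i8 -> v16i1 on BWI: shift each byte left by 7 so the LSB becomes the
// sign bit, then VPMOVB2M gathers the sign bits into a mask register.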
21367 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21368 if (InVT.getScalarSizeInBits() <= 16) {
21369 if (Subtarget.hasBWI()) {
21370 // legal, will go to VPMOVB2M, VPMOVW2M
21371 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21372 // We need to shift to get the lsb into sign position.
21373 // Shifting packed bytes is not supported natively, so bitcast to words.
21374 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21375 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21376 DAG.getBitcast(ExtVT, In),
21377 DAG.getConstant(ShiftInx, DL, ExtVT));
21378 In = DAG.getBitcast(InVT, In);
21379 }
21380 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21381 In, ISD::SETGT);
21382 }
21383 // Use TESTD/Q after extending the vector to packed dword/qword elements.
21384 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21385 "Unexpected vector type.");
21386 unsigned NumElts = InVT.getVectorNumElements();
21387 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21388 // We need to change to a wider element type that we have support for.
21389 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21390 // For 16 element vectors we extend to v16i32 unless we are explicitly
21391 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21392 // we need to split into two 8 element vectors which we can extend to v8i32,
21393 // truncate and concat the results. There's an additional complication if
21394 // the original type is v16i8. In that case we can't split the v16i8
21395 // directly, so we need to shuffle high elements to low and use
21396 // sign_extend_vector_inreg.
21397 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21398 SDValue Lo, Hi;
21399 if (InVT == MVT::v16i8) {
21400 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21401 Hi = DAG.getVectorShuffle(
21402 InVT, DL, In, In,
21403 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21404 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21405 } else {
21406 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21407 Lo = extract128BitVector(In, 0, DAG, DL);
21408 Hi = extract128BitVector(In, 8, DAG, DL);
21409 }
21410 // We're split now, just emit two truncates and a concat. The two
21411 // truncates will trigger legalization to come back to this function.
21412 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21413 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21414 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21415 }
21416 // We either have 8 elements or we're allowed to use 512-bit vectors.
21417 // If we have VLX, we want to use the narrowest vector that can get the
21418 // job done so we use vXi32.
21419 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21420 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21421 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21422 InVT = ExtVT;
21423 ShiftInx = InVT.getScalarSizeInBits() - 1;
21424 }
21425
21426 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21427 // We need to shift to get the lsb into sign position.
21428 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21429 DAG.getConstant(ShiftInx, DL, InVT));
21430 }
21431 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21432 if (Subtarget.hasDQI())
21433 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21434 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21435}
21436
21437SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21438 SDLoc DL(Op);
21439 MVT VT = Op.getSimpleValueType();
21440 SDValue In = Op.getOperand(0);
21441 MVT InVT = In.getSimpleValueType();
21442 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21443 "Invalid TRUNCATE operation");
21444
21445 // If we're called by the type legalizer, handle a few cases.
21446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21447 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21448 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21449 VT.is128BitVector() && Subtarget.hasAVX512()) {
21450 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21451 "Unexpected subtarget!");
21452 // The default behavior is to truncate one step, concatenate, and then
21453 // truncate the remainder. We'd rather produce two 64-bit results and
21454 // concatenate those.
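// e.g. v16i32 -> v16i8: split into two v8i32 halves, truncate each half
// (selected as VPMOVDB, yielding a 64-bit result), then concatenate them.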
21455 SDValue Lo, Hi;
21456 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21457
21458 EVT LoVT, HiVT;
21459 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21460
21461 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21462 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21463 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21464 }
21465
21466 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21467 if (!Subtarget.hasAVX512() ||
21468 (InVT.is512BitVector() && VT.is256BitVector()))
21469 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21470 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21471 return SignPack;
21472
21473 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21474 if (!Subtarget.hasAVX512())
21475 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21476
21477 // Otherwise let default legalization handle it.
21478 return SDValue();
21479 }
21480
21481 if (VT.getVectorElementType() == MVT::i1)
21482 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21483
21484 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21485 // concat from subvectors to use VPTRUNC etc.
21486 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21487 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21488 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21489 return SignPack;
21490
21491 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21492 if (Subtarget.hasAVX512()) {
21493 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21494 assert(VT == MVT::v32i8 && "Unexpected VT!");
21495 return splitVectorIntUnary(Op, DAG, DL);
21496 }
21497
21498 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
21499 // and then truncate that. But we should only do that if we haven't been
21500 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21501 // handled by isel patterns.
21502 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21503 Subtarget.canExtendTo512DQ())
21504 return Op;
21505 }
21506
21507 // Handle truncation of V256 to V128 using shuffles.
21508 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21509
21510 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21511 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21512 if (Subtarget.hasInt256()) {
21513 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21514 In = DAG.getBitcast(MVT::v8i32, In);
21515 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21516 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21517 DAG.getVectorIdxConstant(0, DL));
21518 }
21519
21520 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21521 DAG.getVectorIdxConstant(0, DL));
21522 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21523 DAG.getVectorIdxConstant(2, DL));
21524 static const int ShufMask[] = {0, 2, 4, 6};
21525 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21526 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21527 }
21528
21529 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21530 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21531 if (Subtarget.hasInt256()) {
21532 // The PSHUFB mask:
21533 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21534 -1, -1, -1, -1, -1, -1, -1, -1,
21535 16, 17, 20, 21, 24, 25, 28, 29,
21536 -1, -1, -1, -1, -1, -1, -1, -1 };
21537 In = DAG.getBitcast(MVT::v32i8, In);
21538 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21539 In = DAG.getBitcast(MVT::v4i64, In);
21540
21541 static const int ShufMask2[] = {0, 2, -1, -1};
21542 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21543 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21544 DAG.getVectorIdxConstant(0, DL));
21545 return DAG.getBitcast(MVT::v8i16, In);
21546 }
21547
21548 return Subtarget.hasSSE41()
21549 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21550 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21551 }
21552
21553 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21554 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21555
21556 llvm_unreachable("All 256->128 cases should have been handled above!");
21557}
21558
21559// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21560// behaves on out of range inputs to generate optimized conversions.
21561 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21562 SelectionDAG &DAG,
21563 const X86Subtarget &Subtarget) {
21564 MVT SrcVT = Src.getSimpleValueType();
21565 unsigned DstBits = VT.getScalarSizeInBits();
21566 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21567
21568 // Calculate the converted result for values in the range 0 to
21569 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21570 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21571 SDValue Big =
21572 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21573 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21574 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21575
21576 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21577 // and only if the value was out of range. So we can use that
21578 // as our indicator that we'd rather use "Big" instead of "Small".
21579 //
21580 // Use "Small" if "IsOverflown" has all bits cleared
21581 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
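// e.g. an f32 lane holding 3.0e9: Small = 0x80000000 (out of range),
// Big = cvtt(3.0e9 - 2^31) = 0x32D05E00, and
// Small | (Big & sra(Small, 31)) = 0xB2D05E00 = 3000000000.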
21582
21583 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21584 // use the slightly slower blendv select instead.
21585 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21586 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21587 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21588 }
21589
21590 SDValue IsOverflown =
21591 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21592 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21593 return DAG.getNode(ISD::OR, dl, VT, Small,
21594 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21595}
21596
21597SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21598 bool IsStrict = Op->isStrictFPOpcode();
21599 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21600 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21601 bool HasVLX = Subtarget.hasVLX();
21602 MVT VT = Op->getSimpleValueType(0);
21603 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21604 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21605 MVT SrcVT = Src.getSimpleValueType();
21606 SDLoc dl(Op);
21607
21608 SDValue Res;
21609 if (isSoftF16(SrcVT, Subtarget)) {
21610 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21611 if (IsStrict)
21612 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21613 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21614 {NVT, MVT::Other}, {Chain, Src})});
21615 return DAG.getNode(Op.getOpcode(), dl, VT,
21616 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21617 } else if (isTypeLegal(SrcVT) &&
21618 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21619 return Op;
21620 }
21621
21622 if (VT.isVector()) {
21623 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21624 MVT ResVT = MVT::v4i32;
21625 MVT TruncVT = MVT::v4i1;
21626 unsigned Opc;
21627 if (IsStrict)
21628 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21629 else
21630 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21631
21632 if (!IsSigned && !HasVLX) {
21633 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21634 // Widen to 512-bits.
21635 ResVT = MVT::v8i32;
21636 TruncVT = MVT::v8i1;
21637 Opc = Op.getOpcode();
21638 // Need to concat with zero vector for strict fp to avoid spurious
21639 // exceptions.
21640 // TODO: Should we just do this for non-strict as well?
21641 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21642 : DAG.getUNDEF(MVT::v8f64);
21643 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21644 DAG.getVectorIdxConstant(0, dl));
21645 }
21646 if (IsStrict) {
21647 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21648 Chain = Res.getValue(1);
21649 } else {
21650 Res = DAG.getNode(Opc, dl, ResVT, Src);
21651 }
21652
21653 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21654 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21655 DAG.getVectorIdxConstant(0, dl));
21656 if (IsStrict)
21657 return DAG.getMergeValues({Res, Chain}, dl);
21658 return Res;
21659 }
21660
21661 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21662 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21663 VT == MVT::v32i16)
21664 return Op;
21665
21666 MVT ResVT = VT;
21667 MVT EleVT = VT.getVectorElementType();
21668 if (EleVT != MVT::i64)
21669 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21670
21671 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21672 SDValue Tmp =
21673 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21674 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21675 Ops[0] = Src;
21676 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21677 }
21678
21679 if (!HasVLX) {
21680 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21681 // Widen to 512-bits.
21682 unsigned IntSize = EleVT.getSizeInBits();
21683 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21684 ResVT = MVT::getVectorVT(EleVT, Num);
21685 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21686 Subtarget, DAG, dl);
21687 }
21688
21689 if (IsStrict) {
21690 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21691 : X86ISD::STRICT_CVTTP2UI,
21692 dl, {ResVT, MVT::Other}, {Chain, Src});
21693 Chain = Res.getValue(1);
21694 } else {
21695 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21696 ResVT, Src);
21697 }
21698
21699 // TODO: Need to add exception check code for strict FP.
21700 if (EleVT.getSizeInBits() < 16) {
21701 if (HasVLX)
21702 ResVT = MVT::getVectorVT(EleVT, 8);
21703 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21704 }
21705
21706 if (ResVT != VT)
21707 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21708 DAG.getVectorIdxConstant(0, dl));
21709
21710 if (IsStrict)
21711 return DAG.getMergeValues({Res, Chain}, dl);
21712 return Res;
21713 }
21714
21715 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21716 if (VT.getVectorElementType() == MVT::i16) {
21717 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21718 SrcVT.getVectorElementType() == MVT::f64) &&
21719 "Expected f32/f64 vector!");
21720 MVT NVT = VT.changeVectorElementType(MVT::i32);
21721 if (IsStrict) {
21722 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21723 : ISD::STRICT_FP_TO_UINT,
21724 dl, {NVT, MVT::Other}, {Chain, Src});
21725 Chain = Res.getValue(1);
21726 } else {
21727 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21728 NVT, Src);
21729 }
21730
21731 // TODO: Need to add exception check code for strict FP.
21732 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21733
21734 if (IsStrict)
21735 return DAG.getMergeValues({Res, Chain}, dl);
21736 return Res;
21737 }
21738
21739 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21740 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21741 assert(!IsSigned && "Expected unsigned conversion!");
21742 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21743 return Op;
21744 }
21745
21746 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21747 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21748 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21749 Subtarget.useAVX512Regs()) {
21750 assert(!IsSigned && "Expected unsigned conversion!");
21751 assert(!Subtarget.hasVLX() && "Unexpected features!");
21752 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21753 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21754 // Need to concat with zero vector for strict fp to avoid spurious
21755 // exceptions.
21756 // TODO: Should we just do this for non-strict as well?
21757 SDValue Tmp =
21758 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21759 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21760 DAG.getVectorIdxConstant(0, dl));
21761
21762 if (IsStrict) {
21763 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21764 {Chain, Src});
21765 Chain = Res.getValue(1);
21766 } else {
21767 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21768 }
21769
21770 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21771 DAG.getVectorIdxConstant(0, dl));
21772
21773 if (IsStrict)
21774 return DAG.getMergeValues({Res, Chain}, dl);
21775 return Res;
21776 }
21777
21778 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21779 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21780 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21781 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21782 assert(!Subtarget.hasVLX() && "Unexpected features!");
21783 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21784 // Need to concat with zero vector for strict fp to avoid spurious
21785 // exceptions.
21786 // TODO: Should we just do this for non-strict as well?
21787 SDValue Tmp =
21788 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21789 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21790 DAG.getVectorIdxConstant(0, dl));
21791
21792 if (IsStrict) {
21793 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21794 {Chain, Src});
21795 Chain = Res.getValue(1);
21796 } else {
21797 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21798 }
21799
21800 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21801 DAG.getVectorIdxConstant(0, dl));
21802
21803 if (IsStrict)
21804 return DAG.getMergeValues({Res, Chain}, dl);
21805 return Res;
21806 }
21807
21808 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21809 if (!Subtarget.hasVLX()) {
21810 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21811 // legalizer and then widened again by vector op legalization.
21812 if (!IsStrict)
21813 return SDValue();
21814
21815 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21816 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21817 {Src, Zero, Zero, Zero});
21818 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21819 {Chain, Tmp});
21820 SDValue Chain = Tmp.getValue(1);
21821 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21822 DAG.getVectorIdxConstant(0, dl));
21823 return DAG.getMergeValues({Tmp, Chain}, dl);
21824 }
21825
21826 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21827 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21828 DAG.getUNDEF(MVT::v2f32));
21829 if (IsStrict) {
21830 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21831 : X86ISD::STRICT_CVTTP2UI;
21832 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21833 }
21834 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21835 return DAG.getNode(Opc, dl, VT, Tmp);
21836 }
21837
21838 // Generate optimized instructions for pre AVX512 unsigned conversions from
21839 // vXf32 to vXi32.
21840 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21841 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21842 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21843 assert(!IsSigned && "Expected unsigned conversion!");
21844 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21845 }
21846
21847 return SDValue();
21848 }
21849
21850 assert(!VT.isVector());
21851
21852 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21853
21854 if (!IsSigned && UseSSEReg) {
21855 // Conversions from f32/f64 with AVX512 should be legal.
21856 if (Subtarget.hasAVX512())
21857 return Op;
21858
21859 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21860 // behaves on out of range inputs to generate optimized conversions.
21861 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21862 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21863 unsigned DstBits = VT.getScalarSizeInBits();
21864 APInt UIntLimit = APInt::getSignMask(DstBits);
21865 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21866 DAG.getConstant(UIntLimit, dl, VT));
21867 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21868
21869 // Calculate the converted result for values in the range:
21870 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21871 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21872 SDValue Small =
21873 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21874 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21875 SDValue Big = DAG.getNode(
21876 X86ISD::CVTTS2SI, dl, VT,
21877 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21878 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21879
21880 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21881 // and only if the value was out of range. So we can use that
21882 // as our indicator that we'd rather use "Big" instead of "Small".
21883 //
21884 // Use "Small" if "IsOverflown" has all bits cleared
21885 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21886 SDValue IsOverflown = DAG.getNode(
21887 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21888 return DAG.getNode(ISD::OR, dl, VT, Small,
21889 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21890 }
21891
21892 // Use default expansion for i64.
21893 if (VT == MVT::i64)
21894 return SDValue();
21895
21896 assert(VT == MVT::i32 && "Unexpected VT!");
21897
21898 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21899 // FIXME: This does not generate an invalid exception if the input does not
21900 // fit in i32. PR44019
21901 if (Subtarget.is64Bit()) {
21902 if (IsStrict) {
21903 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21904 {Chain, Src});
21905 Chain = Res.getValue(1);
21906 } else
21907 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21908
21909 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21910 if (IsStrict)
21911 return DAG.getMergeValues({Res, Chain}, dl);
21912 return Res;
21913 }
21914
21915 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21916 // use fisttp which will be handled later.
21917 if (!Subtarget.hasSSE3())
21918 return SDValue();
21919 }
21920
21921 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21922 // FIXME: This does not generate an invalid exception if the input does not
21923 // fit in i16. PR44019
21924 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21925 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21926 if (IsStrict) {
21927 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21928 {Chain, Src});
21929 Chain = Res.getValue(1);
21930 } else
21931 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21932
21933 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21934 if (IsStrict)
21935 return DAG.getMergeValues({Res, Chain}, dl);
21936 return Res;
21937 }
21938
21939 // If this is a FP_TO_SINT using SSEReg we're done.
21940 if (UseSSEReg && IsSigned)
21941 return Op;
21942
21943 // fp128 needs to use a libcall.
21944 if (SrcVT == MVT::f128) {
21945 RTLIB::Libcall LC;
21946 if (IsSigned)
21947 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21948 else
21949 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21950
21951 MakeLibCallOptions CallOptions;
21952 std::pair<SDValue, SDValue> Tmp =
21953 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21954
21955 if (IsStrict)
21956 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21957
21958 return Tmp.first;
21959 }
21960
21961 // Fall back to X87.
21962 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21963 if (IsStrict)
21964 return DAG.getMergeValues({V, Chain}, dl);
21965 return V;
21966 }
21967
21968 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21969}
21970
21971SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21972 SelectionDAG &DAG) const {
21973 SDValue Src = Op.getOperand(0);
21974 EVT DstVT = Op.getSimpleValueType();
21975 MVT SrcVT = Src.getSimpleValueType();
21976
21977 if (SrcVT.isVector())
21978 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21979
21980 if (SrcVT == MVT::f16)
21981 return SDValue();
21982
21983 // If the source is in an SSE register, the node is Legal.
21984 if (isScalarFPTypeInSSEReg(SrcVT))
21985 return Op;
21986
21987 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21988}
21989
21990SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21991 SelectionDAG &DAG) const {
21992 EVT DstVT = N->getValueType(0);
21993 SDValue Src = N->getOperand(0);
21994 EVT SrcVT = Src.getValueType();
21995
21996 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21997 // f16 must be promoted before using the lowering in this routine.
21998 // fp128 does not use this lowering.
21999 return SDValue();
22000 }
22001
22002 SDLoc DL(N);
22003 SDValue Chain = DAG.getEntryNode();
22004
22005 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22006
22007 // If we're converting from SSE, the stack slot needs to hold both types.
22008 // Otherwise it only needs to hold the DstVT.
22009 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22010 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22011 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22012 MachinePointerInfo MPI =
22013 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22014
22015 if (UseSSE) {
22016 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22017 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22018 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22019 SDValue Ops[] = { Chain, StackPtr };
22020
22021 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22022 /*Align*/ std::nullopt,
22023 MachineMemOperand::MOLoad);
22024 Chain = Src.getValue(1);
22025 }
22026
22027 SDValue StoreOps[] = { Chain, Src, StackPtr };
22028 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22029 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22030 MachineMemOperand::MOStore);
22031
22032 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22033}
22034
22035SDValue
22036X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22037 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22038 // but making use of X86 specifics to produce better instruction sequences.
22039 SDNode *Node = Op.getNode();
22040 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22041 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22042 SDLoc dl(SDValue(Node, 0));
22043 SDValue Src = Node->getOperand(0);
22044
22045 // There are three types involved here: SrcVT is the source floating point
22046 // type, DstVT is the type of the result, and TmpVT is the result of the
22047 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22048 // DstVT).
22049 EVT SrcVT = Src.getValueType();
22050 EVT DstVT = Node->getValueType(0);
22051 EVT TmpVT = DstVT;
22052
22053 // This code is only for floats and doubles. Fall back to generic code for
22054 // anything else.
22055 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22056 return SDValue();
22057
22058 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22059 unsigned SatWidth = SatVT.getScalarSizeInBits();
22060 unsigned DstWidth = DstVT.getScalarSizeInBits();
22061 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22062 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22063 "Expected saturation width smaller than result width");
22064
22065 // Promote result of FP_TO_*INT to at least 32 bits.
22066 if (TmpWidth < 32) {
22067 TmpVT = MVT::i32;
22068 TmpWidth = 32;
22069 }
22070
22071 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22072 // us to use a native signed conversion instead.
22073 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22074 TmpVT = MVT::i64;
22075 TmpWidth = 64;
22076 }
22077
22078 // If the saturation width is smaller than the size of the temporary result,
22079 // we can always use signed conversion, which is native.
22080 if (SatWidth < TmpWidth)
22081 FpToIntOpcode = ISD::FP_TO_SINT;
22082
22083 // Determine minimum and maximum integer values and their corresponding
22084 // floating-point values.
22085 APInt MinInt, MaxInt;
22086 if (IsSigned) {
22087 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22088 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22089 } else {
22090 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22091 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22092 }
22093
22094 const fltSemantics &Sem = SrcVT.getFltSemantics();
22095 APFloat MinFloat(Sem);
22096 APFloat MaxFloat(Sem);
22097
22098 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22099 MinInt, IsSigned, APFloat::rmTowardZero);
22100 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22101 MaxInt, IsSigned, APFloat::rmTowardZero);
22102 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22103 && !(MaxStatus & APFloat::opStatus::opInexact);
22104
22105 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22106 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22107
22108 // If the integer bounds are exactly representable as floats, emit a
22109 // min+max+fptoi sequence. Otherwise use comparisons and selects.
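// e.g. for f64 -> i32 both bounds (-2^31 and 2^31-1) are exact doubles, so the
// clamp path below is taken; for f32 -> i32 the upper bound 2^31-1 is inexact
// (it rounds to 2147483520.0f), so the compare+select path is used instead.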
22110 if (AreExactFloatBounds) {
22111 if (DstVT != TmpVT) {
22112 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22113 SDValue MinClamped = DAG.getNode(
22114 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22115 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22116 SDValue BothClamped = DAG.getNode(
22117 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22118 // Convert clamped value to integer.
22119 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22120
22121 // NaN will become INDVAL, with the top bit set and the rest zero.
22122 // Truncation will discard the top bit, resulting in zero.
22123 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22124 }
22125
22126 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22127 SDValue MinClamped = DAG.getNode(
22128 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22129 // Clamp by MaxFloat from above. NaN cannot occur.
22130 SDValue BothClamped = DAG.getNode(
22131 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22132 // Convert clamped value to integer.
22133 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22134
22135 if (!IsSigned) {
22136 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22137 // which is zero.
22138 return FpToInt;
22139 }
22140
22141 // Otherwise, select zero if Src is NaN.
22142 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22143 return DAG.getSelectCC(
22144 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22145 }
22146
22147 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22148 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22149
22150 // Result of direct conversion, which may be selected away.
22151 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22152
22153 if (DstVT != TmpVT) {
22154 // NaN will become INDVAL, with the top bit set and the rest zero.
22155 // Truncation will discard the top bit, resulting in zero.
22156 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22157 }
22158
22159 SDValue Select = FpToInt;
22160 // For signed conversions where we saturate to the same size as the
22161 // result type of the fptoi instructions, INDVAL coincides with integer
22162 // minimum, so we don't need to explicitly check it.
22163 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22164 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22165 // MinInt if Src is NaN.
22166 Select = DAG.getSelectCC(
22167 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22168 }
22169
22170 // If Src OGT MaxFloat, select MaxInt.
22171 Select = DAG.getSelectCC(
22172 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22173
22174 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22175 // is already zero. The promoted case was already handled above.
22176 if (!IsSigned || DstVT != TmpVT) {
22177 return Select;
22178 }
22179
22180 // Otherwise, select 0 if Src is NaN.
22181 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22182 return DAG.getSelectCC(
22183 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22184}
22185
22186SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22187 bool IsStrict = Op->isStrictFPOpcode();
22188
22189 SDLoc DL(Op);
22190 MVT VT = Op.getSimpleValueType();
22191 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22192 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22193 MVT SVT = In.getSimpleValueType();
22194
22195 // Let f16->f80 get lowered to a libcall, except for Darwin, where we should
22196 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available).
22197 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22198 !Subtarget.getTargetTriple().isOSDarwin()))
22199 return SDValue();
22200
22201 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22202 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22203 return Op;
22204
22205 if (SVT == MVT::f16) {
22206 if (Subtarget.hasFP16())
22207 return Op;
22208
22209 if (VT != MVT::f32) {
22210 if (IsStrict)
22211 return DAG.getNode(
22212 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22213 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22214 {MVT::f32, MVT::Other}, {Chain, In})});
22215
22216 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22217 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22218 }
22219
22220 if (!Subtarget.hasF16C()) {
22221 if (!Subtarget.getTargetTriple().isOSDarwin())
22222 return SDValue();
22223
22224 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22225
22226 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22227 TargetLowering::CallLoweringInfo CLI(DAG);
22228 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22229
22230 In = DAG.getBitcast(MVT::i16, In);
22231 TargetLowering::ArgListTy Args;
22232 TargetLowering::ArgListEntry Entry(
22233 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22234 Entry.IsSExt = false;
22235 Entry.IsZExt = true;
22236 Args.push_back(Entry);
22237
22238 SDValue Callee = DAG.getExternalSymbol(
22239 getLibcallName(RTLIB::FPEXT_F16_F32),
22240 getPointerTy(DAG.getDataLayout()));
22241 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22242 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22243 std::move(Args));
22244
22245 SDValue Res;
22246 std::tie(Res,Chain) = LowerCallTo(CLI);
22247 if (IsStrict)
22248 Res = DAG.getMergeValues({Res, Chain}, DL);
22249
22250 return Res;
22251 }
22252
22253 In = DAG.getBitcast(MVT::i16, In);
22254 SDValue Res;
22255 if (IsStrict) {
22256 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22257 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22258 DAG.getVectorIdxConstant(0, DL));
22259 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22260 {Chain, In});
22261 Chain = Res.getValue(1);
22262 } else {
22263 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22264 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22265 DAG.getUNDEF(MVT::v4i32), In,
22266 DAG.getVectorIdxConstant(0, DL));
22267 In = DAG.getBitcast(MVT::v8i16, In);
22268 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22269 DAG.getTargetConstant(4, DL, MVT::i32));
22270 }
22271 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22272 DAG.getVectorIdxConstant(0, DL));
22273 if (IsStrict)
22274 return DAG.getMergeValues({Res, Chain}, DL);
22275 return Res;
22276 }
22277
22278 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22279 return Op;
22280
22281 if (SVT.getVectorElementType() == MVT::f16) {
22282 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22283 return Op;
22284 assert(Subtarget.hasF16C() && "Unexpected features!");
22285 if (SVT == MVT::v2f16)
22286 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22287 DAG.getUNDEF(MVT::v2f16));
22288 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22289 DAG.getUNDEF(MVT::v4f16));
22290 if (IsStrict)
22291 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22292 {Op->getOperand(0), Res});
22293 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22294 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22295 return Op;
22296 }
22297
22298 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22299
22300 SDValue Res =
22301 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22302 if (IsStrict)
22303 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22304 {Op->getOperand(0), Res});
22305 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22306}
22307
22308SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22309 bool IsStrict = Op->isStrictFPOpcode();
22310
22311 SDLoc DL(Op);
22312 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22313 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22314 MVT VT = Op.getSimpleValueType();
22315 MVT SVT = In.getSimpleValueType();
22316
22317 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22318 return SDValue();
22319
22320 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22321 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22322 if (!Subtarget.getTargetTriple().isOSDarwin())
22323 return SDValue();
22324
22325 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22326 TargetLowering::CallLoweringInfo CLI(DAG);
22327 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22328
22329 TargetLowering::ArgListTy Args;
22330 TargetLowering::ArgListEntry Entry(
22331 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22332 Entry.IsSExt = false;
22333 Entry.IsZExt = true;
22334 Args.push_back(Entry);
22335
22336 SDValue Callee = DAG.getExternalSymbol(
22337 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22338 : RTLIB::FPROUND_F32_F16),
22339 getPointerTy(DAG.getDataLayout()));
22340 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22341 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22342 std::move(Args));
22343
22344 SDValue Res;
22345 std::tie(Res, Chain) = LowerCallTo(CLI);
22346
22347 Res = DAG.getBitcast(MVT::f16, Res);
22348
22349 if (IsStrict)
22350 Res = DAG.getMergeValues({Res, Chain}, DL);
22351
22352 return Res;
22353 }
22354
22355 if (VT.getScalarType() == MVT::bf16) {
22356 if (SVT.getScalarType() == MVT::f32 &&
22357 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22358 Subtarget.hasAVXNECONVERT()))
22359 return Op;
22360 return SDValue();
22361 }
22362
22363 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22364 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22365 return SDValue();
22366
22367 if (VT.isVector())
22368 return Op;
22369
22370 SDValue Res;
22371 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22372 MVT::i32);
22373 if (IsStrict) {
22374 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22375 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22376 DAG.getVectorIdxConstant(0, DL));
22377 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22378 {Chain, Res, Rnd});
22379 Chain = Res.getValue(1);
22380 } else {
22381 // FIXME: Should we use zeros for upper elements for non-strict?
22382 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22383 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22384 }
22385
22386 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22387 DAG.getVectorIdxConstant(0, DL));
22388 Res = DAG.getBitcast(MVT::f16, Res);
22389
22390 if (IsStrict)
22391 return DAG.getMergeValues({Res, Chain}, DL);
22392
22393 return Res;
22394 }
22395
22396 return Op;
22397}
22398
22399 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22400 bool IsStrict = Op->isStrictFPOpcode();
22401 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22402 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22403 "Unexpected VT!");
22404
22405 SDLoc dl(Op);
22406 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22407 DAG.getConstant(0, dl, MVT::v8i16), Src,
22408 DAG.getVectorIdxConstant(0, dl));
22409
22410 SDValue Chain;
22411 if (IsStrict) {
22412 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22413 {Op.getOperand(0), Res});
22414 Chain = Res.getValue(1);
22415 } else {
22416 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22417 }
22418
22419 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22420 DAG.getVectorIdxConstant(0, dl));
22421
22422 if (IsStrict)
22423 return DAG.getMergeValues({Res, Chain}, dl);
22424
22425 return Res;
22426}
22427
22428 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22429 bool IsStrict = Op->isStrictFPOpcode();
22430 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22431 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22432 "Unexpected VT!");
22433
22434 SDLoc dl(Op);
22435 SDValue Res, Chain;
22436 if (IsStrict) {
22437 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22438 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22439 DAG.getVectorIdxConstant(0, dl));
22440 Res = DAG.getNode(
22441 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22442 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22443 Chain = Res.getValue(1);
22444 } else {
22445 // FIXME: Should we use zeros for upper elements for non-strict?
22446 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22447 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22448 DAG.getTargetConstant(4, dl, MVT::i32));
22449 }
22450
22451 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22452 DAG.getVectorIdxConstant(0, dl));
22453
22454 if (IsStrict)
22455 return DAG.getMergeValues({Res, Chain}, dl);
22456
22457 return Res;
22458}
22459
22460SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22461 SelectionDAG &DAG) const {
22462 SDLoc DL(Op);
22463
22464 MVT SVT = Op.getOperand(0).getSimpleValueType();
22465 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22466 Subtarget.hasAVXNECONVERT())) {
22467 SDValue Res;
22468 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22469 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22470 Res = DAG.getBitcast(MVT::v8i16, Res);
22471 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22472 DAG.getVectorIdxConstant(0, DL));
22473 }
22474
22475 MakeLibCallOptions CallOptions;
22476 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22477 SDValue Res =
22478 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22479 return DAG.getBitcast(MVT::i16, Res);
22480}
22481
22482/// Depending on uarch and/or optimizing for size, we might prefer to use a
22483/// vector operation in place of the typical scalar operation.
22484 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22485 SelectionDAG &DAG,
22486 const X86Subtarget &Subtarget) {
22487 // If both operands have other uses, this is probably not profitable.
22488 SDValue LHS = Op.getOperand(0);
22489 SDValue RHS = Op.getOperand(1);
22490 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22491 return Op;
22492
22493 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22494 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22495 if (IsFP && !Subtarget.hasSSE3())
22496 return Op;
22497 if (!IsFP && !Subtarget.hasSSSE3())
22498 return Op;
22499
22500 // Extract from a common vector.
22501 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22502 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22503 LHS.getOperand(0) != RHS.getOperand(0) ||
22504 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22505 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22506 !shouldUseHorizontalOp(true, DAG, Subtarget))
22507 return Op;
22508
22509 // Allow commuted 'hadd' ops.
22510 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22511 unsigned HOpcode;
22512 switch (Op.getOpcode()) {
22513 // clang-format off
22514 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22515 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22516 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22517 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22518 default:
22519 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22520 // clang-format on
22521 }
22522 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22523 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22524 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22525 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22526 std::swap(LExtIndex, RExtIndex);
22527
22528 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22529 return Op;
22530
22531 SDValue X = LHS.getOperand(0);
22532 EVT VecVT = X.getValueType();
22533 unsigned BitWidth = VecVT.getSizeInBits();
22534 unsigned NumLanes = BitWidth / 128;
22535 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22536 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22537 "Not expecting illegal vector widths here");
22538
22539 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22540 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22541 if (BitWidth == 256 || BitWidth == 512) {
22542 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22543 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22544 LExtIndex %= NumEltsPerLane;
22545 }
22546
22547 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22548 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22549 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22550 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22551 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22552 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22553 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22554}
22555
22556/// Depending on uarch and/or optimizing for size, we might prefer to use a
22557/// vector operation in place of the typical scalar operation.
22558SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22559 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22560 "Only expecting float/double");
22561 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22562}
22563
22564/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22565/// This mode isn't supported in hardware on X86. But as long as we aren't
22566/// compiling with trapping math, we can emulate this with
22567/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
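/// e.g. for the largest float below 0.5, adding a plain 0.5 would round up to
/// 1.0 and truncate to 1.0, while FROUND of that value must be 0.0; adding
/// nextafter(0.5, 0.0) keeps the sum below 1.0 so the truncation gives 0.0.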
22568 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22569 SDValue N0 = Op.getOperand(0);
22570 SDLoc dl(Op);
22571 MVT VT = Op.getSimpleValueType();
22572
22573 // N0 += copysign(nextafter(0.5, 0.0), N0)
22574 const fltSemantics &Sem = VT.getFltSemantics();
22575 bool Ignored;
22576 APFloat Point5Pred = APFloat(0.5f);
22577 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22578 Point5Pred.next(/*nextDown*/true);
22579
22580 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22581 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22582 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22583
22584 // Truncate the result to remove fraction.
22585 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22586}
22587
22588/// The only differences between FABS and FNEG are the mask and the logic op.
22589/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22590 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22591 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22592 "Wrong opcode for lowering FABS or FNEG.");
22593
22594 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22595
22596 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22597 // into an FNABS. We'll lower the FABS after that if it is still in use.
22598 if (IsFABS)
22599 for (SDNode *User : Op->users())
22600 if (User->getOpcode() == ISD::FNEG)
22601 return Op;
22602
22603 SDLoc dl(Op);
22604 MVT VT = Op.getSimpleValueType();
22605
22606 bool IsF128 = (VT == MVT::f128);
22607 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22609 "Unexpected type in LowerFABSorFNEG");
22610
22611 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22612 // decide if we should generate a 16-byte constant mask when we only need 4 or
22613 // 8 bytes for the scalar case.
22614
22615 // There are no scalar bitwise logical SSE/AVX instructions, so we
22616 // generate a 16-byte vector constant and logic op even for the scalar case.
22617 // Using a 16-byte mask allows folding the load of the mask with
22618 // the logic op, so it can save (~4 bytes) on code size.
22619 bool IsFakeVector = !VT.isVector() && !IsF128;
22620 MVT LogicVT = VT;
22621 if (IsFakeVector)
22622 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22623 : (VT == MVT::f32) ? MVT::v4f32
22624 : MVT::v8f16;
22625
22626 unsigned EltBits = VT.getScalarSizeInBits();
22627 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22628 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22629 APInt::getSignMask(EltBits);
22630 const fltSemantics &Sem = VT.getFltSemantics();
22631 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22632
22633 SDValue Op0 = Op.getOperand(0);
22634 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22635 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22636 IsFNABS ? X86ISD::FOR :
22637 X86ISD::FXOR;
22638 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22639
22640 if (VT.isVector() || IsF128)
22641 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22642
22643 // For the scalar case extend to a 128-bit vector, perform the logic op,
22644 // and extract the scalar result back out.
22645 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22646 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22647 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22648 DAG.getVectorIdxConstant(0, dl));
22649}
22650
22651 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22652 SDValue Mag = Op.getOperand(0);
22653 SDValue Sign = Op.getOperand(1);
22654 SDLoc dl(Op);
22655
22656 // If the sign operand is smaller, extend it first.
22657 MVT VT = Op.getSimpleValueType();
22658 if (Sign.getSimpleValueType().bitsLT(VT))
22659 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22660
22661 // And if it is bigger, shrink it first.
22662 if (Sign.getSimpleValueType().bitsGT(VT))
22663 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22664 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22665
22666 // At this point the operands and the result should have the same
22667 // type, and that won't be f80 since that is not custom lowered.
22668 bool IsF128 = (VT == MVT::f128);
22669 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22671 "Unexpected type in LowerFCOPYSIGN");
22672
22673 const fltSemantics &Sem = VT.getFltSemantics();
22674
22675 // Perform all scalar logic operations as 16-byte vectors because there are no
22676 // scalar FP logic instructions in SSE.
22677 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22678 // unnecessary splats, but we might miss load folding opportunities. Should
22679 // this decision be based on OptimizeForSize?
22680 bool IsFakeVector = !VT.isVector() && !IsF128;
22681 MVT LogicVT = VT;
22682 if (IsFakeVector)
22683 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22684 : (VT == MVT::f32) ? MVT::v4f32
22685 : MVT::v8f16;
22686
22687 // The mask constants are automatically splatted for vector types.
22688 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22689 SDValue SignMask = DAG.getConstantFP(
22690 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22691 SDValue MagMask = DAG.getConstantFP(
22692 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22693
22694 // First, clear all bits but the sign bit from the second operand (sign).
22695 if (IsFakeVector)
22696 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22697 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22698
22699 // Next, clear the sign bit from the first operand (magnitude).
22700 // TODO: If we had general constant folding for FP logic ops, this check
22701 // wouldn't be necessary.
22702 SDValue MagBits;
22703 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22704 APFloat APF = Op0CN->getValueAPF();
22705 APF.clearSign();
22706 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22707 } else {
22708 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22709 if (IsFakeVector)
22710 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22711 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22712 }
22713
22714 // OR the magnitude value with the sign bit.
22715 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22716 return !IsFakeVector ? Or
22717 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22718 DAG.getVectorIdxConstant(0, dl));
22719}
22720
22721 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22722 SDValue N0 = Op.getOperand(0);
22723 SDLoc dl(Op);
22724 MVT VT = Op.getSimpleValueType();
22725
22726 MVT OpVT = N0.getSimpleValueType();
22727 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22728 "Unexpected type for FGETSIGN");
22729
22730 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
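// MOVMSK packs the sign bit of every vector lane into the low bits of a GPR,
// so masking with 1 keeps just the sign of the original scalar in lane 0.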
22731 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22732 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22733 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22734 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22735 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22736 return Res;
22737}
22738
22739/// Helper for attempting to create a X86ISD::BT node.
22740static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22741 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22742 // instruction. Since the shift amount is in-range-or-undefined, we know
22743 // that doing a bittest on the i32 value is ok. We extend to i32 because
22744 // the encoding for the i16 version is larger than the i32 version.
22745 // Also promote i16 to i32 for performance / code size reasons.
22746 if (Src.getValueType().getScalarSizeInBits() < 32)
22747 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22748
22749 // No legal type found, give up.
22750 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22751 return SDValue();
22752
22753 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22754 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22755 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22756 // known to be zero.
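// E.g. the 32-bit BT tests bit (BitNo mod 32) of a register source while the
// 64-bit form tests bit (BitNo mod 64), so the narrower encoding is only safe
// when bit 5 of BitNo is provably zero.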
22757 if (Src.getValueType() == MVT::i64 &&
22758 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22759 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22760
22761 // If the operand types disagree, extend the shift amount to match. Since
22762 // BT ignores high bits (like shifts) we can use anyextend.
22763 if (Src.getValueType() != BitNo.getValueType()) {
22764 // Peek through a mask/modulo operation.
22765 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22766 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22767 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22768 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22769 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22770 BitNo.getOperand(0)),
22771 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22772 BitNo.getOperand(1)));
22773 else
22774 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22775 }
22776
22777 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22778}
22779
22780/// Helper for creating a X86ISD::SETCC node.
22781 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22782 SelectionDAG &DAG) {
22783 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22784 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22785}
22786
22787/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22788/// recognizable memcmp expansion.
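/// E.g. (or (or (xor a, b), (xor c, d)), (xor e, f)) is accepted, while a lone
/// xor at the root is rejected since that is just a plain A == B / A != B
/// compare and needs no tree handling.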
22789static bool isOrXorXorTree(SDValue X, bool Root = true) {
22790 if (X.getOpcode() == ISD::OR)
22791 return isOrXorXorTree(X.getOperand(0), false) &&
22792 isOrXorXorTree(X.getOperand(1), false);
22793 if (Root)
22794 return false;
22795 return X.getOpcode() == ISD::XOR;
22796}
22797
22798/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22799/// expansion.
22800template <typename F>
22801 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22802 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22803 SDValue Op0 = X.getOperand(0);
22804 SDValue Op1 = X.getOperand(1);
22805 if (X.getOpcode() == ISD::OR) {
22806 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22807 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22808 if (VecVT != CmpVT)
22809 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22810 if (HasPT)
22811 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22812 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22813 }
22814 if (X.getOpcode() == ISD::XOR) {
22815 SDValue A = SToV(Op0);
22816 SDValue B = SToV(Op1);
22817 if (VecVT != CmpVT)
22818 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22819 if (HasPT)
22820 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22821 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22822 }
22823 llvm_unreachable("Impossible");
22824}
22825
22826/// Try to map a 128-bit or larger integer comparison to vector instructions
22827/// before type legalization splits it up into chunks.
22828 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22829 ISD::CondCode CC,
22830 const SDLoc &DL,
22831 SelectionDAG &DAG,
22832 const X86Subtarget &Subtarget) {
22833 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22834
22835 // We're looking for an oversized integer equality comparison.
22836 EVT OpVT = X.getValueType();
22837 unsigned OpSize = OpVT.getSizeInBits();
22838 if (!OpVT.isScalarInteger() || OpSize < 128)
22839 return SDValue();
22840
22841 // Ignore a comparison with zero because that gets special treatment in
22842 // EmitTest(). But make an exception for the special case of a pair of
22843 // logically-combined vector-sized operands compared to zero. This pattern may
22844 // be generated by the memcmp expansion pass with oversized integer compares
22845 // (see PR33325).
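// Roughly, a memcmp(a, b, 32) == 0 expansion loads two 16-byte chunks from
// each buffer and tests (or (xor a0, b0), (xor a1, b1)) == 0, which is exactly
// the tree matched by isOrXorXorTree above.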
22846 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22847 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22848 return SDValue();
22849
22850 // Don't perform this combine if constructing the vector will be expensive.
22851 auto IsVectorBitCastCheap = [](SDValue X) {
22852 X = peekThroughBitcasts(X);
22853 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22854 X.getOpcode() == ISD::LOAD;
22855 };
22856 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22857 !IsOrXorXorTreeCCZero)
22858 return SDValue();
22859
22860 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22861 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22862 // Otherwise use PCMPEQ (plus AND) and mask testing.
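// In each case the oversized scalar compare collapses into a few vector ops
// plus a single flag-producing instruction (PTEST, KORTEST, or CMP of a
// MOVMSK result).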
22863 bool NoImplicitFloatOps =
22864 DAG.getMachineFunction().getFunction().hasFnAttribute(
22865 Attribute::NoImplicitFloat);
22866 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22867 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22868 (OpSize == 256 && Subtarget.hasAVX()) ||
22869 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22870 bool HasPT = Subtarget.hasSSE41();
22871
22872 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22873 // vector registers are essentially free. (Technically, widening registers
22874 // prevents load folding, but the tradeoff is worth it.)
22875 bool PreferKOT = Subtarget.preferMaskRegisters();
22876 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22877
22878 EVT VecVT = MVT::v16i8;
22879 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22880 if (OpSize == 256) {
22881 VecVT = MVT::v32i8;
22882 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22883 }
22884 EVT CastVT = VecVT;
22885 bool NeedsAVX512FCast = false;
22886 if (OpSize == 512 || NeedZExt) {
22887 if (Subtarget.hasBWI()) {
22888 VecVT = MVT::v64i8;
22889 CmpVT = MVT::v64i1;
22890 if (OpSize == 512)
22891 CastVT = VecVT;
22892 } else {
22893 VecVT = MVT::v16i32;
22894 CmpVT = MVT::v16i1;
22895 CastVT = OpSize == 512 ? VecVT
22896 : OpSize == 256 ? MVT::v8i32
22897 : MVT::v4i32;
22898 NeedsAVX512FCast = true;
22899 }
22900 }
22901
22902 auto ScalarToVector = [&](SDValue X) -> SDValue {
22903 bool TmpZext = false;
22904 EVT TmpCastVT = CastVT;
22905 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22906 SDValue OrigX = X.getOperand(0);
22907 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22908 if (OrigSize < OpSize) {
22909 if (OrigSize == 128) {
22910 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22911 X = OrigX;
22912 TmpZext = true;
22913 } else if (OrigSize == 256) {
22914 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22915 X = OrigX;
22916 TmpZext = true;
22917 }
22918 }
22919 }
22920 X = DAG.getBitcast(TmpCastVT, X);
22921 if (!NeedZExt && !TmpZext)
22922 return X;
22923 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22924 DAG.getConstant(0, DL, VecVT), X,
22925 DAG.getVectorIdxConstant(0, DL));
22926 };
22927
22928 SDValue Cmp;
22929 if (IsOrXorXorTreeCCZero) {
22930 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22931 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22932 // Use 2 vector equality compares and 'and' the results before doing a
22933 // MOVMSK.
22934 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22935 } else {
22936 SDValue VecX = ScalarToVector(X);
22937 SDValue VecY = ScalarToVector(Y);
22938 if (VecVT != CmpVT) {
22939 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22940 } else if (HasPT) {
22941 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22942 } else {
22943 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22944 }
22945 }
22946 // AVX512 should emit a setcc that will lower to kortest.
22947 if (VecVT != CmpVT) {
22948 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22949 : CmpVT == MVT::v32i1 ? MVT::i32
22950 : MVT::i16;
22951 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22952 DAG.getConstant(0, DL, KRegVT), CC);
22953 }
22954 if (HasPT) {
22955 SDValue BCCmp =
22956 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22957 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22958 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22959 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22960 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22961 }
22962 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22963 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22964 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22965 assert(Cmp.getValueType() == MVT::v16i8 &&
22966 "Non 128-bit vector on pre-SSE41 target");
22967 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22968 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22969 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22970 }
22971
22972 return SDValue();
22973}
22974
22975/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22976/// style scalarized (associative) reduction patterns. Partial reductions
22977/// are supported when the pointer SrcMask is non-null.
22978/// TODO - move this to SelectionDAG?
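/// E.g. with BinOp == ISD::OR,
///   or (or (extractelt %v, 0), (extractelt %v, 1)),
///      (or (extractelt %v, 2), (extractelt %v, 3))
/// over a v4i32 source matches with SrcOps == {%v} and a full element mask.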
22979 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22980 SmallVectorImpl<SDValue> &SrcOps,
22981 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22982 SmallVector<SDValue, 8> Opnds;
22983 DenseMap<SDValue, APInt> SrcOpMap;
22984 EVT VT = MVT::Other;
22985
22986 // Recognize a special case where a vector is cast into a wide integer to
22987 // test all 0s.
22988 assert(Op.getOpcode() == unsigned(BinOp) &&
22989 "Unexpected bit reduction opcode");
22990 Opnds.push_back(Op.getOperand(0));
22991 Opnds.push_back(Op.getOperand(1));
22992
22993 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22994 SDValue I = Opnds[Slot];
22995 // BFS traverse all BinOp operands.
22996 if (I->getOpcode() == unsigned(BinOp)) {
22997 Opnds.push_back(I->getOperand(0));
22998 Opnds.push_back(I->getOperand(1));
22999 // Re-evaluate the number of nodes to be traversed.
23000 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23001 continue;
23002 }
23003
23004 // Quit if not an EXTRACT_VECTOR_ELT.
23005 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23006 return false;
23007
23008 // Quit if the index is not a constant.
23009 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23010 if (!Idx)
23011 return false;
23012
23013 SDValue Src = I->getOperand(0);
23014 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23015 if (M == SrcOpMap.end()) {
23016 VT = Src.getValueType();
23017 // Quit if not the same type.
23018 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23019 return false;
23020 unsigned NumElts = VT.getVectorNumElements();
23021 APInt EltCount = APInt::getZero(NumElts);
23022 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23023 SrcOps.push_back(Src);
23024 }
23025
23026 // Quit if element already used.
23027 unsigned CIdx = Idx->getZExtValue();
23028 if (M->second[CIdx])
23029 return false;
23030 M->second.setBit(CIdx);
23031 }
23032
23033 if (SrcMask) {
23034 // Collect the source partial masks.
23035 for (SDValue &SrcOp : SrcOps)
23036 SrcMask->push_back(SrcOpMap[SrcOp]);
23037 } else {
23038 // Quit if not all elements are used.
23039 for (const auto &I : SrcOpMap)
23040 if (!I.second.isAllOnes())
23041 return false;
23042 }
23043
23044 return true;
23045}
23046
23047// Helper function for comparing all bits of two vectors.
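// For 128-bit and wider inputs this ultimately produces one flag-setting
// instruction: KORTEST or PTEST where available, otherwise
// CMP(MOVMSK(NOT(PCMPEQ LHS, RHS)), 0), with any masking/splitting applied to
// the vectors beforehand.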
23048 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23049 ISD::CondCode CC, const APInt &OriginalMask,
23050 const X86Subtarget &Subtarget,
23051 SelectionDAG &DAG, X86::CondCode &X86CC) {
23052 EVT VT = LHS.getValueType();
23053 unsigned ScalarSize = VT.getScalarSizeInBits();
23054 if (OriginalMask.getBitWidth() != ScalarSize) {
23055 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23056 return SDValue();
23057 }
23058
23059 // Quit if not convertible to a legal scalar or 128/256-bit vector.
23061 return SDValue();
23062
23063 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23064 if (VT.isFloatingPoint())
23065 return SDValue();
23066
23067 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23068 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23069
23070 APInt Mask = OriginalMask;
23071
23072 auto MaskBits = [&](SDValue Src) {
23073 if (Mask.isAllOnes())
23074 return Src;
23075 EVT SrcVT = Src.getValueType();
23076 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23077 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23078 };
23079
23080 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23081 if (VT.getSizeInBits() < 128) {
23082 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23083 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23084 if (IntVT != MVT::i64)
23085 return SDValue();
23086 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23087 MVT::i32, MVT::i32);
23088 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23089 MVT::i32, MVT::i32);
23090 SDValue Lo =
23091 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23092 SDValue Hi =
23093 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23094 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23095 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23096 DAG.getConstant(0, DL, MVT::i32));
23097 }
23098 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23099 DAG.getBitcast(IntVT, MaskBits(LHS)),
23100 DAG.getBitcast(IntVT, MaskBits(RHS)));
23101 }
23102
23103 // Without PTEST, a masked v2i64 or-reduction is not faster than
23104 // scalarization.
23105 bool UseKORTEST = Subtarget.useAVX512Regs();
23106 bool UsePTEST = Subtarget.hasSSE41();
23107 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23108 return SDValue();
23109
23110 // Split down to 128/256/512-bit vector.
23111 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23112
23113 // If the input vector has vector elements wider than the target test size,
23114 // then cast to <X x i64> so it will safely split.
23115 if (ScalarSize > TestSize) {
23116 if (!Mask.isAllOnes())
23117 return SDValue();
23118 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23119 LHS = DAG.getBitcast(VT, LHS);
23120 RHS = DAG.getBitcast(VT, RHS);
23121 Mask = APInt::getAllOnes(64);
23122 }
23123
23124 if (VT.getSizeInBits() > TestSize) {
23125 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23126 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23127 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23128 while (VT.getSizeInBits() > TestSize) {
23129 auto Split = DAG.SplitVector(LHS, DL);
23130 VT = Split.first.getValueType();
23131 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23132 }
23133 RHS = DAG.getAllOnesConstant(DL, VT);
23134 } else if (!UsePTEST && !KnownRHS.isZero()) {
23135 // MOVMSK Special Case:
23136 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23137 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23138 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23139 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23140 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23141 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23142 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23143 V = DAG.getSExtOrTrunc(V, DL, VT);
23144 while (VT.getSizeInBits() > TestSize) {
23145 auto Split = DAG.SplitVector(V, DL);
23146 VT = Split.first.getValueType();
23147 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23148 }
23149 V = DAG.getNOT(DL, V, VT);
23150 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23151 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23152 DAG.getConstant(0, DL, MVT::i32));
23153 } else {
23154 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23155 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23156 while (VT.getSizeInBits() > TestSize) {
23157 auto Split = DAG.SplitVector(V, DL);
23158 VT = Split.first.getValueType();
23159 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23160 }
23161 LHS = V;
23162 RHS = DAG.getConstant(0, DL, VT);
23163 }
23164 }
23165
23166 if (UseKORTEST && VT.is512BitVector()) {
23167 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23168 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23169 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23170 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23171 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23172 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23173 }
23174
23175 if (UsePTEST) {
23176 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23177 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23178 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23179 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23180 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23181 }
23182
23183 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23184 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23185 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23186 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23187 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23188 V = DAG.getNOT(DL, V, MaskVT);
23189 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23190 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23191 DAG.getConstant(0, DL, MVT::i32));
23192}
23193
23194 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23195// to CMP(MOVMSK(PCMPEQB(X,Y))).
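// E.g. icmp eq (or (extractelt %v, 0), (extractelt %v, 1)), 0 over v2i64 is an
// "any bit set?" reduction and can be answered by a single PTEST of %v against
// itself (ZF is set iff every bit is zero).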
23196 static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23197 ISD::CondCode CC, const SDLoc &DL,
23198 const X86Subtarget &Subtarget,
23199 SelectionDAG &DAG,
23200 X86::CondCode &X86CC) {
23201 SDValue Op = OrigLHS;
23202
23203 bool CmpNull;
23204 APInt Mask;
23205 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23206 CmpNull = isNullConstant(OrigRHS);
23207 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23208 return SDValue();
23209
23210 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23211 return SDValue();
23212
23213 // Check whether we're masking/truncating an OR-reduction result, in which
23214 // case track the masked bits.
23215 // TODO: Add CmpAllOnes support.
23216 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23217 if (CmpNull) {
23218 switch (Op.getOpcode()) {
23219 case ISD::TRUNCATE: {
23220 SDValue Src = Op.getOperand(0);
23221 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23222 Op.getScalarValueSizeInBits());
23223 Op = Src;
23224 break;
23225 }
23226 case ISD::AND: {
23227 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23228 Mask = Cst->getAPIntValue();
23229 Op = Op.getOperand(0);
23230 }
23231 break;
23232 }
23233 }
23234 }
23235 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23236 CC = ISD::SETEQ;
23237 CmpNull = true;
23238 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23239 } else {
23240 return SDValue();
23241 }
23242
23243 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23244
23245 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23246 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23247 SmallVector<SDValue, 8> VecIns;
23248 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23249 EVT VT = VecIns[0].getValueType();
23250 assert(llvm::all_of(VecIns,
23251 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23252 "Reduction source vector mismatch");
23253
23254 // Quit if not splittable to scalar/128/256/512-bit vector.
23256 return SDValue();
23257
23258 // If more than one full vector is evaluated, AND/OR them first before
23259 // PTEST.
23260 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23261 Slot += 2, e += 1) {
23262 // Each iteration will AND/OR 2 nodes and append the result until there is
23263 // only 1 node left, i.e. the final value of all vectors.
23264 SDValue LHS = VecIns[Slot];
23265 SDValue RHS = VecIns[Slot + 1];
23266 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23267 }
23268
23269 return LowerVectorAllEqual(DL, VecIns.back(),
23270 CmpNull ? DAG.getConstant(0, DL, VT)
23271 : DAG.getAllOnesConstant(DL, VT),
23272 CC, Mask, Subtarget, DAG, X86CC);
23273 }
23274
23275 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23276 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23277 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23278 ISD::NodeType BinOp;
23279 if (SDValue Match =
23280 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23281 EVT MatchVT = Match.getValueType();
23282 return LowerVectorAllEqual(DL, Match,
23283 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23284 : DAG.getAllOnesConstant(DL, MatchVT),
23285 CC, Mask, Subtarget, DAG, X86CC);
23286 }
23287 }
23288
23289 if (Mask.isAllOnes()) {
23290 assert(!Op.getValueType().isVector() &&
23291 "Illegal vector type for reduction pattern");
23292 SDValue Src = peekThroughBitcasts(Op);
23293 if (Src.getValueType().isFixedLengthVector() &&
23294 Src.getValueType().getScalarType() == MVT::i1) {
23295 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23296 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23297 if (Src.getOpcode() == ISD::SETCC) {
23298 SDValue LHS = Src.getOperand(0);
23299 SDValue RHS = Src.getOperand(1);
23300 EVT LHSVT = LHS.getValueType();
23301 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23302 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23304 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23305 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23306 X86CC);
23307 }
23308 }
23309 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23310 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23311 // Peek through truncation, mask the LSB and compare against zero/LSB.
23312 if (Src.getOpcode() == ISD::TRUNCATE) {
23313 SDValue Inner = Src.getOperand(0);
23314 EVT InnerVT = Inner.getValueType();
23316 unsigned BW = InnerVT.getScalarSizeInBits();
23317 APInt SrcMask = APInt(BW, 1);
23318 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23319 return LowerVectorAllEqual(DL, Inner,
23320 DAG.getConstant(Cmp, DL, InnerVT), CC,
23321 SrcMask, Subtarget, DAG, X86CC);
23322 }
23323 }
23324 }
23325 }
23326
23327 return SDValue();
23328}
23329
23330 /// Return true if \c Op has a use that doesn't just read flags.
23331 static bool hasNonFlagsUse(SDValue Op) {
23332 for (SDUse &Use : Op->uses()) {
23333 SDNode *User = Use.getUser();
23334 unsigned UOpNo = Use.getOperandNo();
23335 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23336 // Look past truncate.
23337 UOpNo = User->use_begin()->getOperandNo();
23338 User = User->use_begin()->getUser();
23339 }
23340
23341 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23342 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23343 return true;
23344 }
23345 return false;
23346}
23347
23348// Transform to an x86-specific ALU node with flags if there is a chance of
23349// using an RMW op or only the flags are used. Otherwise, leave
23350// the node alone and emit a 'cmp' or 'test' instruction.
23351 static bool isProfitableToUseFlagOp(SDValue Op) {
23352 for (SDNode *U : Op->users())
23353 if (U->getOpcode() != ISD::CopyToReg &&
23354 U->getOpcode() != ISD::SETCC &&
23355 U->getOpcode() != ISD::STORE)
23356 return false;
23357
23358 return true;
23359}
23360
23361/// Emit nodes that will be selected as "test Op0,Op0", or something
23362/// equivalent.
23363 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23364 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23365 // CF and OF aren't always set the way we want. Determine which
23366 // of these we need.
23367 bool NeedCF = false;
23368 bool NeedOF = false;
23369 switch (X86CC) {
23370 default: break;
23371 case X86::COND_A: case X86::COND_AE:
23372 case X86::COND_B: case X86::COND_BE:
23373 NeedCF = true;
23374 break;
23375 case X86::COND_G: case X86::COND_GE:
23376 case X86::COND_L: case X86::COND_LE:
23377 case X86::COND_O: case X86::COND_NO: {
23378 // Check if we really need to set the
23379 // Overflow flag. If NoSignedWrap is present,
23380 // that is not actually needed.
23381 switch (Op->getOpcode()) {
23382 case ISD::ADD:
23383 case ISD::SUB:
23384 case ISD::MUL:
23385 case ISD::SHL:
23386 if (Op.getNode()->getFlags().hasNoSignedWrap())
23387 break;
23388 [[fallthrough]];
23389 default:
23390 NeedOF = true;
23391 break;
23392 }
23393 break;
23394 }
23395 }
23396 // See if we can use the EFLAGS value from the operand instead of
23397 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23398 // we prove that the arithmetic won't overflow, we can't use OF or CF.
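// E.g. reusing an ADD's flags for COND_E (ZF) is always fine, but COND_L reads
// SF^OF; TEST would force OF to 0 while the ADD sets it on signed wrap, so the
// reuse is only legal when the add provably cannot overflow (nsw).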
23399 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23400 // Emit a CMP with 0, which is the TEST pattern.
23401 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23402 DAG.getConstant(0, dl, Op.getValueType()));
23403 }
23404 unsigned Opcode = 0;
23405 unsigned NumOperands = 0;
23406
23407 SDValue ArithOp = Op;
23408
23409 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23410 // which may be the result of a CAST. We use the variable 'Op', which is the
23411 // non-casted variable when we check for possible users.
23412 switch (ArithOp.getOpcode()) {
23413 case ISD::AND:
23414 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23415 // because a TEST instruction will be better.
23416 if (!hasNonFlagsUse(Op))
23417 break;
23418
23419 [[fallthrough]];
23420 case ISD::ADD:
23421 case ISD::SUB:
23422 case ISD::OR:
23423 case ISD::XOR:
23424 if (!isProfitableToUseFlagOp(Op))
23425 break;
23426
23427 // Otherwise use a regular EFLAGS-setting instruction.
23428 switch (ArithOp.getOpcode()) {
23429 // clang-format off
23430 default: llvm_unreachable("unexpected operator!");
23431 case ISD::ADD: Opcode = X86ISD::ADD; break;
23432 case ISD::SUB: Opcode = X86ISD::SUB; break;
23433 case ISD::XOR: Opcode = X86ISD::XOR; break;
23434 case ISD::AND: Opcode = X86ISD::AND; break;
23435 case ISD::OR: Opcode = X86ISD::OR; break;
23436 // clang-format on
23437 }
23438
23439 NumOperands = 2;
23440 break;
23441 case X86ISD::ADD:
23442 case X86ISD::SUB:
23443 case X86ISD::OR:
23444 case X86ISD::XOR:
23445 case X86ISD::AND:
23446 return SDValue(Op.getNode(), 1);
23447 case ISD::SSUBO:
23448 case ISD::USUBO: {
23449 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23450 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23451 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23452 Op->getOperand(1)).getValue(1);
23453 }
23454 default:
23455 break;
23456 }
23457
23458 if (Opcode == 0) {
23459 // Emit a CMP with 0, which is the TEST pattern.
23460 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23461 DAG.getConstant(0, dl, Op.getValueType()));
23462 }
23463 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23464 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23465
23466 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23467 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23468 return SDValue(New.getNode(), 1);
23469}
23470
23471/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23472/// equivalent.
23473 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23474 const SDLoc &dl, SelectionDAG &DAG,
23475 const X86Subtarget &Subtarget) {
23476 if (isNullConstant(Op1))
23477 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23478
23479 EVT CmpVT = Op0.getValueType();
23480
23481 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23482 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23483
23484 // Only promote the compare up to i32 if it is a 16-bit operation
23485 // with an immediate. 16-bit immediates are to be avoided unless the target
23486 // isn't slowed down by length-changing prefixes, we're optimizing for
23487 // code size, or the comparison is with a folded load.
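// E.g. "cmp $0x1234, %ax" needs a 0x66 operand-size prefix in front of an
// opcode with a 16-bit immediate, a length-changing prefix that several Intel
// cores pre-decode slowly; the widened 32-bit compare avoids it.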
23488 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23489 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23491 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23492 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23493 // Don't do this if the immediate can fit in 8-bits.
23494 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23495 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23496 unsigned ExtendOp =
23498 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23499 // For equality comparisons try to use SIGN_EXTEND if the input was
23500 // truncated from something with enough sign bits.
23501 if (Op0.getOpcode() == ISD::TRUNCATE) {
23502 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23503 ExtendOp = ISD::SIGN_EXTEND;
23504 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23505 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23506 ExtendOp = ISD::SIGN_EXTEND;
23507 }
23508 }
23509
23510 CmpVT = MVT::i32;
23511 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23512 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23513 }
23514 }
23515
23516 // Try to shrink i64 compares if the input has enough zero bits.
23517 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23518 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23519 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23520 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23521 CmpVT = MVT::i32;
23522 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23523 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23524 }
23525
23526 // Try to shrink all i64 compares if the inputs are representable as signed
23527 // i32.
23528 if (CmpVT == MVT::i64 &&
23529 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23530 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23531 CmpVT = MVT::i32;
23532 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23533 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23534 }
23535
23536 // 0-x == y --> x+y == 0
23537 // 0-x != y --> x+y != 0
23538 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23539 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23540 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23541 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23542 return Add.getValue(1);
23543 }
23544
23545 // x == 0-y --> x+y == 0
23546 // x != 0-y --> x+y != 0
23547 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23548 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23549 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23550 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23551 return Add.getValue(1);
23552 }
23553
23554 // If we already have an XOR of the ops, use that to check for equality.
23555 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23556 unsigned X86Opc = X86ISD::SUB;
23557 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23558 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23559 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23560 X86Opc = X86ISD::XOR;
23561
23562 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23563 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23564 return CmpOp.getValue(1);
23565}
23566
23571
23572bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23573 SDNode *N, SDValue, SDValue IntPow2) const {
23574 if (N->getOpcode() == ISD::FDIV)
23575 return true;
23576
23577 EVT FPVT = N->getValueType(0);
23578 EVT IntVT = IntPow2.getValueType();
23579
23580 // This indicates a non-free bitcast.
23581 // TODO: This is probably overly conservative as we will need to scale the
23582 // integer vector anyways for the int->fp cast.
23583 if (FPVT.isVector() &&
23584 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23585 return false;
23586
23587 return true;
23588}
23589
23590/// Check if replacement of SQRT with RSQRT should be disabled.
23591bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23592 EVT VT = Op.getValueType();
23593
23594 // We don't need to replace SQRT with RSQRT for half type.
23595 if (VT.getScalarType() == MVT::f16)
23596 return true;
23597
23598 // We never want to use both SQRT and RSQRT instructions for the same input.
23599 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23600 return false;
23601
23602 if (VT.isVector())
23603 return Subtarget.hasFastVectorFSQRT();
23604 return Subtarget.hasFastScalarFSQRT();
23605}
23606
23607/// The minimum architected relative accuracy is 2^-12. We need one
23608/// Newton-Raphson step to have a good float result (24 bits of precision).
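/// One Newton-Raphson refinement of an rsqrt estimate E for input A is
/// E' = E * (1.5 - 0.5 * A * E * E); the generic DAG combiner wraps the
/// estimate returned here in that refinement when RefinementSteps > 0.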
23609SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23610 SelectionDAG &DAG, int Enabled,
23611 int &RefinementSteps,
23612 bool &UseOneConstNR,
23613 bool Reciprocal) const {
23614 SDLoc DL(Op);
23615 EVT VT = Op.getValueType();
23616
23617 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23618 // It is likely not profitable to do this for f64 because a double-precision
23619 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23620 // instructions: convert to single, rsqrtss, convert back to double, refine
23621 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23622 // along with FMA, this could be a throughput win.
23623 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23624 // after legalize types.
23625 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23626 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23627 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23628 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23629 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23630 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23631 RefinementSteps = 1;
23632
23633 UseOneConstNR = false;
23634 // There is no FRSQRT for 512-bits, but there is RSQRT14.
23635 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23636 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23637 if (RefinementSteps == 0 && !Reciprocal)
23638 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23639 return Estimate;
23640 }
23641
23642 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23643 Subtarget.hasFP16()) {
23644 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23645 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23646 RefinementSteps = 0;
23647
23648 if (VT == MVT::f16) {
23650 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23651 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23652 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23653 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23654 }
23655
23656 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23657 }
23658 return SDValue();
23659}
23660
23661/// The minimum architected relative accuracy is 2^-12. We need one
23662/// Newton-Raphson step to have a good float result (24 bits of precision).
23663SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23664 int Enabled,
23665 int &RefinementSteps) const {
23666 SDLoc DL(Op);
23667 EVT VT = Op.getValueType();
23668
23669 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23670 // It is likely not profitable to do this for f64 because a double-precision
23671 // reciprocal estimate with refinement on x86 prior to FMA requires
23672 // 15 instructions: convert to single, rcpss, convert back to double, refine
23673 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23674 // along with FMA, this could be a throughput win.
23675
23676 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23677 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23678 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23679 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23680 // Enable estimate codegen with 1 refinement step for vector division.
23681 // Scalar division estimates are disabled because they break too much
23682 // real-world code. These defaults are intended to match GCC behavior.
23683 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23684 return SDValue();
23685
23686 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23687 RefinementSteps = 1;
23688
23689 // There is no FRCP for 512-bits, but there is RCP14.
23690 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23691 return DAG.getNode(Opcode, DL, VT, Op);
23692 }
23693
23694 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23695 Subtarget.hasFP16()) {
23696 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23697 RefinementSteps = 0;
23698
23699 if (VT == MVT::f16) {
23701 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23702 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23703 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23704 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23705 }
23706
23707 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23708 }
23709 return SDValue();
23710}
23711
23712/// If we have at least two divisions that use the same divisor, convert to
23713/// multiplication by a reciprocal. This may need to be adjusted for a given
23714/// CPU if a division's cost is not at least twice the cost of a multiplication.
23715/// This is because we still need one division to calculate the reciprocal and
23716/// then we need two multiplies by that reciprocal as replacements for the
23717/// original divisions.
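/// E.g. "x/d; y/d" becomes "r = 1.0/d; x*r; y*r": one divide plus two
/// multiplies, which only pays off when a divide costs at least two multiplies.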
23718 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23719 return 2;
23720}
23721
23722SDValue
23723X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23724 SelectionDAG &DAG,
23725 SmallVectorImpl<SDNode *> &Created) const {
23726 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23727 if (isIntDivCheap(N->getValueType(0), Attr))
23728 return SDValue(N,0); // Lower SDIV as SDIV
23729
23730 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23731 "Unexpected divisor!");
23732
23733 // Only perform this transform if CMOV is supported, otherwise the select
23734 // below will become a branch.
23735 if (!Subtarget.canUseCMOV())
23736 return SDValue();
23737
23738 // fold (sdiv X, pow2)
23739 EVT VT = N->getValueType(0);
23740 // FIXME: Support i8.
23741 if (VT != MVT::i16 && VT != MVT::i32 &&
23742 !(Subtarget.is64Bit() && VT == MVT::i64))
23743 return SDValue();
23744
23745 // If the divisor is 2 or -2, the default expansion is better.
23746 if (Divisor == 2 ||
23747 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23748 return SDValue();
23749
23750 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23751}
23752
23753/// Result of 'and' is compared against zero. Change to a BT node if possible.
23754/// Returns the BT node and the condition code needed to use it.
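/// E.g. (and (srl %x, %n), 1) == 0 and (and %x, (shl 1, %n)) == 0 both become
/// BT %x, %n with COND_AE (bit clear); the corresponding != 0 forms use COND_B
/// (bit set, i.e. CF == 1).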
23755 static SDValue getBitTestCondition(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23756 SelectionDAG &DAG, X86::CondCode &X86CC) {
23757 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23758 SDValue Op0 = And.getOperand(0);
23759 SDValue Op1 = And.getOperand(1);
23760 if (Op0.getOpcode() == ISD::TRUNCATE)
23761 Op0 = Op0.getOperand(0);
23762 if (Op1.getOpcode() == ISD::TRUNCATE)
23763 Op1 = Op1.getOperand(0);
23764
23765 SDValue Src, BitNo;
23766 if (Op1.getOpcode() == ISD::SHL)
23767 std::swap(Op0, Op1);
23768 if (Op0.getOpcode() == ISD::SHL) {
23769 if (isOneConstant(Op0.getOperand(0))) {
23770 // If we looked past a truncate, check that it's only truncating away
23771 // known zeros.
23772 unsigned BitWidth = Op0.getValueSizeInBits();
23773 unsigned AndBitWidth = And.getValueSizeInBits();
23774 if (BitWidth > AndBitWidth) {
23775 KnownBits Known = DAG.computeKnownBits(Op0);
23776 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23777 return SDValue();
23778 }
23779 Src = Op1;
23780 BitNo = Op0.getOperand(1);
23781 }
23782 } else if (Op1.getOpcode() == ISD::Constant) {
23783 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23784 uint64_t AndRHSVal = AndRHS->getZExtValue();
23785 SDValue AndLHS = Op0;
23786
23787 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23788 Src = AndLHS.getOperand(0);
23789 BitNo = AndLHS.getOperand(1);
23790 } else {
23791 // Use BT if the immediate can't be encoded in a TEST instruction or we
23792 // are optimizing for size and the immediate won't fit in a byte.
23793 bool OptForSize = DAG.shouldOptForSize();
23794 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23795 isPowerOf2_64(AndRHSVal)) {
23796 Src = AndLHS;
23797 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23798 Src.getValueType());
23799 }
23800 }
23801 }
23802
23803 // No patterns found, give up.
23804 if (!Src.getNode())
23805 return SDValue();
23806
23807 // Remove any bit flip.
23808 if (isBitwiseNot(Src)) {
23809 Src = Src.getOperand(0);
23810 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23811 }
23812
23813 // Attempt to create the X86ISD::BT node.
23814 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23815 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23816 return BT;
23817 }
23818
23819 return SDValue();
23820}
23821
23822// Check if pre-AVX condcode can be performed by a single FCMP op.
23823static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23824 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23825}
23826
23827/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23828/// CMPs.
23829static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23830 SDValue &Op1, bool &IsAlwaysSignaling) {
23831 unsigned SSECC;
23832 bool Swap = false;
23833
23834 // SSE Condition code mapping:
23835 // 0 - EQ
23836 // 1 - LT
23837 // 2 - LE
23838 // 3 - UNORD
23839 // 4 - NEQ
23840 // 5 - NLT
23841 // 6 - NLE
23842 // 7 - ORD
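// Values 8 (EQ_UQ) and 12 (NEQ_OQ) below only exist as AVX VCMPPS/VCMPPD
// immediates; on pre-AVX targets SETUEQ/SETONE are instead expanded into two
// compares combined with FOR/FAND (see cheapX86FSETCC_SSE).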
23843 switch (SetCCOpcode) {
23844 // clang-format off
23845 default: llvm_unreachable("Unexpected SETCC condition");
23846 case ISD::SETOEQ:
23847 case ISD::SETEQ: SSECC = 0; break;
23848 case ISD::SETOGT:
23849 case ISD::SETGT: Swap = true; [[fallthrough]];
23850 case ISD::SETLT:
23851 case ISD::SETOLT: SSECC = 1; break;
23852 case ISD::SETOGE:
23853 case ISD::SETGE: Swap = true; [[fallthrough]];
23854 case ISD::SETLE:
23855 case ISD::SETOLE: SSECC = 2; break;
23856 case ISD::SETUO: SSECC = 3; break;
23857 case ISD::SETUNE:
23858 case ISD::SETNE: SSECC = 4; break;
23859 case ISD::SETULE: Swap = true; [[fallthrough]];
23860 case ISD::SETUGE: SSECC = 5; break;
23861 case ISD::SETULT: Swap = true; [[fallthrough]];
23862 case ISD::SETUGT: SSECC = 6; break;
23863 case ISD::SETO: SSECC = 7; break;
23864 case ISD::SETUEQ: SSECC = 8; break;
23865 case ISD::SETONE: SSECC = 12; break;
23866 // clang-format on
23867 }
23868 if (Swap)
23869 std::swap(Op0, Op1);
23870
23871 switch (SetCCOpcode) {
23872 default:
23873 IsAlwaysSignaling = true;
23874 break;
23875 case ISD::SETEQ:
23876 case ISD::SETOEQ:
23877 case ISD::SETUEQ:
23878 case ISD::SETNE:
23879 case ISD::SETONE:
23880 case ISD::SETUNE:
23881 case ISD::SETO:
23882 case ISD::SETUO:
23883 IsAlwaysSignaling = false;
23884 break;
23885 }
23886
23887 return SSECC;
23888}
23889
23890/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23891/// concatenate the result back.
23893 SelectionDAG &DAG, const SDLoc &dl) {
23894 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23895 "Unsupported VTs!");
23896 SDValue CC = DAG.getCondCode(Cond);
23897
23898 // Extract the LHS Lo/Hi vectors
23899 SDValue LHS1, LHS2;
23900 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23901
23902 // Extract the RHS Lo/Hi vectors
23903 SDValue RHS1, RHS2;
23904 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23905
23906 // Issue the operation on the smaller types and concatenate the result back
23907 EVT LoVT, HiVT;
23908 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23909 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23910 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23911 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23912}
23913
23915 SelectionDAG &DAG) {
23916 SDValue Op0 = Op.getOperand(0);
23917 SDValue Op1 = Op.getOperand(1);
23918 SDValue CC = Op.getOperand(2);
23919 MVT VT = Op.getSimpleValueType();
23920 assert(VT.getVectorElementType() == MVT::i1 &&
23921 "Cannot set masked compare for this operation");
23922
23923 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23924
23925 // Prefer SETGT over SETLT.
23926 if (SetCCOpcode == ISD::SETLT) {
23927 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23928 std::swap(Op0, Op1);
23929 }
23930
23931 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23932}
23933
23934/// Given a buildvector constant, return a new vector constant with each element
23935/// incremented or decremented. If incrementing or decrementing would result in
23936 /// unsigned overflow or underflow, or this is not a simple vector constant,
23937/// return an empty value.
23939 bool NSW) {
23940 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23941 if (!BV || !V.getValueType().isSimple())
23942 return SDValue();
23943
23944 MVT VT = V.getSimpleValueType();
23945 MVT EltVT = VT.getVectorElementType();
23946 unsigned NumElts = VT.getVectorNumElements();
23947 SmallVector<SDValue, 8> NewVecC;
23948 SDLoc DL(V);
23949 for (unsigned i = 0; i < NumElts; ++i) {
23950 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23951 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23952 return SDValue();
23953
23954 // Avoid overflow/underflow.
23955 const APInt &EltC = Elt->getAPIntValue();
23956 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23957 return SDValue();
23958 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23959 (!IsInc && EltC.isMinSignedValue())))
23960 return SDValue();
23961
23962 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23963 }
23964
23965 return DAG.getBuildVector(VT, DL, NewVecC);
23966}
23967
23968/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23969/// Op0 u<= Op1:
23970/// t = psubus Op0, Op1
23971/// pcmpeq t, <0..0>
23973 ISD::CondCode Cond, const SDLoc &dl,
23974 const X86Subtarget &Subtarget,
23975 SelectionDAG &DAG) {
23976 if (!Subtarget.hasSSE2())
23977 return SDValue();
23978
23979 MVT VET = VT.getVectorElementType();
23980 if (VET != MVT::i8 && VET != MVT::i16)
23981 return SDValue();
23982
23983 switch (Cond) {
23984 default:
23985 return SDValue();
23986 case ISD::SETULT: {
23987 // If the comparison is against a constant we can turn this into a
23988 // setule. With psubus, setule does not require a swap. This is
23989 // beneficial because the constant in the register is no longer
23990 // clobbered as the destination operand, so it can be hoisted out of a loop.
23991 // Only do this pre-AVX since vpcmp* is no longer destructive.
23992 if (Subtarget.hasAVX())
23993 return SDValue();
23994 SDValue ULEOp1 =
23995 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23996 if (!ULEOp1)
23997 return SDValue();
23998 Op1 = ULEOp1;
23999 break;
24000 }
24001 case ISD::SETUGT: {
24002 // If the comparison is against a constant, we can turn this into a setuge.
24003 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24004 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24005 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24006 SDValue UGEOp1 =
24007 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24008 if (!UGEOp1)
24009 return SDValue();
24010 Op1 = Op0;
24011 Op0 = UGEOp1;
24012 break;
24013 }
24014 // Psubus is better than flip-sign because it requires no inversion.
24015 case ISD::SETUGE:
24016 std::swap(Op0, Op1);
24017 break;
24018 case ISD::SETULE:
24019 break;
24020 }
24021
24022 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24023 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24024 DAG.getConstant(0, dl, VT));
24025}
24026
24027static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24028 SelectionDAG &DAG) {
24029 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24030 Op.getOpcode() == ISD::STRICT_FSETCCS;
24031 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24032 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24033 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24034 MVT VT = Op->getSimpleValueType(0);
24035 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24036 MVT OpVT = Op0.getSimpleValueType();
24037 SDLoc dl(Op);
24038
24039 if (OpVT.isFloatingPoint()) {
24040 MVT EltVT = OpVT.getVectorElementType();
24041 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24042 EltVT == MVT::f64);
24043
24044 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24045 if (isSoftF16(EltVT, Subtarget)) {
24046 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24047 return SDValue();
24048
24049 // Break 256-bit FP vector compare into smaller ones.
24050 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24051 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24052
24053 // Break 512-bit FP vector compare into smaller ones.
24054 if (OpVT.is512BitVector())
24055 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24056
24057 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24058 if (IsStrict) {
24059 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24060 {Chain, Op0});
24061 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24062 {Chain, Op1});
24063 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24064 {Chain, Op0, Op1, CC});
24065 }
24066 MVT DVT = VT.getVectorElementType() == MVT::i16
24067 ? VT.changeVectorElementType(MVT::i32)
24068 : VT;
24069 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24070 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24071 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24072 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24073 }
24074
24075 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24076
24077 // If we have a strict compare with a vXi1 result and the input is 128/256
24078 // bits we can't use a masked compare unless we have VLX. If we use a wider
24079 // compare like we do for non-strict, we might trigger spurious exceptions
24080 // from the upper elements. Instead emit an AVX compare and convert it to a mask.
24081 unsigned Opc;
24082 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24083 (!IsStrict || Subtarget.hasVLX() ||
24084 Op0.getSimpleValueType().is512BitVector())) {
24085 #ifndef NDEBUG
24086 unsigned Num = VT.getVectorNumElements();
24087 assert(Num <= 16 ||
24088 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24089#endif
24090 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24091 } else {
24092 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24093 // The SSE/AVX packed FP comparison nodes are defined with a
24094 // floating-point vector result that matches the operand type. This allows
24095 // them to work with an SSE1 target (integer vector types are not legal).
24096 VT = Op0.getSimpleValueType();
24097 }
24098
24099 SDValue Cmp;
24100 bool IsAlwaysSignaling;
24101 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24102 if (!Subtarget.hasAVX()) {
24103 // TODO: We could use the following steps to handle a quiet compare with
24104 // signaling encodings.
24105 // 1. Get ordered masks from a quiet ISD::SETO
24106 // 2. Use the masks to mask potential unordered elements in operand A, B
24107 // 3. Get the compare results of masked A, B
24108 // 4. Calculate the final result using the mask and the result from 3
24109 // But currently, we just fall back to scalar operations.
24110 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24111 return SDValue();
24112
24113 // Insert an extra signaling instruction to raise an exception.
24114 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24115 SDValue SignalCmp = DAG.getNode(
24116 Opc, dl, {VT, MVT::Other},
24117 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24118 // FIXME: It seems we need to update the flags of all new strict nodes.
24119 // Otherwise, mayRaiseFPException in MI will return false due to
24120 // NoFPExcept = false by default. However, I didn't find it in other
24121 // patches.
24122 SignalCmp->setFlags(Op->getFlags());
24123 Chain = SignalCmp.getValue(1);
24124 }
24125
24126 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24127 // emit two comparisons and a logic op to tie them together.
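// ueq(x,y) == unord(x,y) | oeq(x,y) and one(x,y) == ord(x,y) & une(x,y),
// which is exactly the CC0/CC1/CombineOpc selection below.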
24128 if (!cheapX86FSETCC_SSE(Cond)) {
24129 // LLVM predicate is SETUEQ or SETONE.
24130 unsigned CC0, CC1;
24131 unsigned CombineOpc;
24132 if (Cond == ISD::SETUEQ) {
24133 CC0 = 3; // UNORD
24134 CC1 = 0; // EQ
24135 CombineOpc = X86ISD::FOR;
24136 } else {
24137 assert(Cond == ISD::SETONE);
24138 CC0 = 7; // ORD
24139 CC1 = 4; // NEQ
24140 CombineOpc = X86ISD::FAND;
24141 }
24142
24143 SDValue Cmp0, Cmp1;
24144 if (IsStrict) {
24145 Cmp0 = DAG.getNode(
24146 Opc, dl, {VT, MVT::Other},
24147 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24148 Cmp1 = DAG.getNode(
24149 Opc, dl, {VT, MVT::Other},
24150 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24151 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24152 Cmp1.getValue(1));
24153 } else {
24154 Cmp0 = DAG.getNode(
24155 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24156 Cmp1 = DAG.getNode(
24157 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24158 }
24159 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24160 } else {
24161 if (IsStrict) {
24162 Cmp = DAG.getNode(
24163 Opc, dl, {VT, MVT::Other},
24164 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24165 Chain = Cmp.getValue(1);
24166 } else
24167 Cmp = DAG.getNode(
24168 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24169 }
24170 } else {
24171 // Handle all other FP comparisons here.
24172 if (IsStrict) {
24173 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24174 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24175 Cmp = DAG.getNode(
24176 Opc, dl, {VT, MVT::Other},
24177 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24178 Chain = Cmp.getValue(1);
24179 } else
24180 Cmp = DAG.getNode(
24181 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24182 }
24183
24184 if (VT.getFixedSizeInBits() >
24185 Op.getSimpleValueType().getFixedSizeInBits()) {
24186 // We emitted a compare with an XMM/YMM result. Finish converting to a
24187 // mask register using a vptestm.
24189 Cmp = DAG.getBitcast(CastVT, Cmp);
24190 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24191 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24192 } else {
24193 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24194 // the result type of SETCC. The bitcast is expected to be optimized
24195 // away during combining/isel.
24196 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24197 }
24198
24199 if (IsStrict)
24200 return DAG.getMergeValues({Cmp, Chain}, dl);
24201
24202 return Cmp;
24203 }
24204
24205 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24206
24207 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24208 assert(VTOp0 == Op1.getSimpleValueType() &&
24209 "Expected operands with same type!");
24210 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24211 "Invalid number of packed elements for source and destination!");
24212
24213 // The non-AVX512 code below works under the assumption that source and
24214 // destination types are the same.
24215 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24216 "Value types for source and destination must be the same!");
24217
24218 // The result is boolean, but operands are int/float
24219 if (VT.getVectorElementType() == MVT::i1) {
24220 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24221 // but there is no compare instruction for i8 and i16 elements in KNL.
24222 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24223 "Unexpected operand type");
24224 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24225 }
24226
24227 // Lower using XOP integer comparisons.
24228 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24229 // Translate compare code to XOP PCOM compare mode.
24230 unsigned CmpMode = 0;
24231 switch (Cond) {
24232 // clang-format off
24233 default: llvm_unreachable("Unexpected SETCC condition");
24234 case ISD::SETULT:
24235 case ISD::SETLT: CmpMode = 0x00; break;
24236 case ISD::SETULE:
24237 case ISD::SETLE: CmpMode = 0x01; break;
24238 case ISD::SETUGT:
24239 case ISD::SETGT: CmpMode = 0x02; break;
24240 case ISD::SETUGE:
24241 case ISD::SETGE: CmpMode = 0x03; break;
24242 case ISD::SETEQ: CmpMode = 0x04; break;
24243 case ISD::SETNE: CmpMode = 0x05; break;
24244 // clang-format on
24245 }
24246
24247 // Are we comparing unsigned or signed integers?
24248     unsigned Opc =
24249         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24250
24251 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24252 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24253 }
24254
24255 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24256 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24258 SDValue BC0 = peekThroughBitcasts(Op0);
24259 if (BC0.getOpcode() == ISD::AND &&
24261 /*AllowUndefs=*/false)) {
24262 Cond = ISD::SETEQ;
24263 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24264 }
24265 }
24266
24267 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
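  // Worked example of this fold (illustrative, per-element scalar view): for
  // i32 elements and C = 0x10 (bit 4), ShiftAmt = 32 - 4 - 1 = 27, so
  //   (X << 27) >> 31   // arithmetic shift
  // splats bit 4 across the element, producing all-ones exactly when
  // (X & 0x10) == 0x10, which is the boolean vector expected from the SETEQ.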
24268 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24269 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24271 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24272 unsigned BitWidth = VT.getScalarSizeInBits();
24273 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24274
24275 SDValue Result = Op0.getOperand(0);
24276 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24277 DAG.getConstant(ShiftAmt, dl, VT));
24278 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24279 DAG.getConstant(BitWidth - 1, dl, VT));
24280 return Result;
24281 }
24282 }
24283
24284 // Break 256-bit integer vector compare into smaller ones.
24285 if (VT.is256BitVector() && !Subtarget.hasInt256())
24286 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24287
24288 // Break 512-bit integer vector compare into smaller ones.
24289 // TODO: Try harder to use VPCMPx + VPMOV2x?
24290 if (VT.is512BitVector())
24291 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24292
24293 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24294 // not-of-PCMPEQ:
24295 // X != INT_MIN --> X >s INT_MIN
24296 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24297 // +X != 0 --> +X >s 0
24298 APInt ConstValue;
24299 if (Cond == ISD::SETNE &&
24300 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24301 if (ConstValue.isMinSignedValue())
24302 Cond = ISD::SETGT;
24303 else if (ConstValue.isMaxSignedValue())
24304 Cond = ISD::SETLT;
24305 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24306 Cond = ISD::SETGT;
24307 }
24308
24309 // If both operands are known non-negative, then an unsigned compare is the
24310 // same as a signed compare and there's no need to flip signbits.
24311 // TODO: We could check for more general simplifications here since we're
24312 // computing known bits.
24313 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24314 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24315
24316 // Special case: Use min/max operations for unsigned compares.
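  // Scalar view of the identities used below (illustrative only):
  //   X <=u Y  <=>  X == umin(X, Y)    // SETULE -> UMIN + PCMPEQ
  //   X >=u Y  <=>  X == umax(X, Y)    // SETUGE -> UMAX + PCMPEQ
  // e.g. a SETULT against a constant, X <u 7, is first rewritten to
  // X <=u 6 and then lowered as X == umin(X, 6).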
24317 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24319 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24320 TLI.isOperationLegal(ISD::UMIN, VT)) {
24321 // If we have a constant operand, increment/decrement it and change the
24322 // condition to avoid an invert.
24323 if (Cond == ISD::SETUGT) {
24324 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24325 if (SDValue UGTOp1 =
24326 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24327 Op1 = UGTOp1;
24328 Cond = ISD::SETUGE;
24329 }
24330 }
24331 if (Cond == ISD::SETULT) {
24332 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24333 if (SDValue ULTOp1 =
24334 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24335 Op1 = ULTOp1;
24336 Cond = ISD::SETULE;
24337 }
24338 }
24339 bool Invert = false;
24340 unsigned Opc;
24341 switch (Cond) {
24342 // clang-format off
24343 default: llvm_unreachable("Unexpected condition code");
24344 case ISD::SETUGT: Invert = true; [[fallthrough]];
24345 case ISD::SETULE: Opc = ISD::UMIN; break;
24346 case ISD::SETULT: Invert = true; [[fallthrough]];
24347 case ISD::SETUGE: Opc = ISD::UMAX; break;
24348 // clang-format on
24349 }
24350
24351 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24352 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24353
24354 // If the logical-not of the result is required, perform that now.
24355 if (Invert)
24356 Result = DAG.getNOT(dl, Result, VT);
24357
24358 return Result;
24359 }
24360
24361 // Try to use SUBUS and PCMPEQ.
24362 if (FlipSigns)
24363 if (SDValue V =
24364 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24365 return V;
24366
24367 // We are handling one of the integer comparisons here. Since SSE only has
24368   // GT and EQ comparisons for integers, swapping operands and multiple
24369 // operations may be required for some comparisons.
24370   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24371                                                             : X86ISD::PCMPGT;
24372   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24373               Cond == ISD::SETLE || Cond == ISD::SETULE;
24374   bool Invert = Cond == ISD::SETNE ||
24375                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24376
24377 if (Swap)
24378 std::swap(Op0, Op1);
24379
24380 // Check that the operation in question is available (most are plain SSE2,
24381 // but PCMPGTQ and PCMPEQQ have different requirements).
24382 if (VT == MVT::v2i64) {
24383 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24384 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24385
24386 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24387 // the odd elements over the even elements.
24388 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24389 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24390 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24391
24392 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24393 static const int MaskHi[] = { 1, 1, 3, 3 };
24394 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24395
24396 return DAG.getBitcast(VT, Result);
24397 }
24398
24399 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24400 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24401 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24402
24403 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24404 static const int MaskHi[] = { 1, 1, 3, 3 };
24405 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24406
24407 return DAG.getBitcast(VT, Result);
24408 }
24409
24410       // If the i64 elements are sign-extended enough to be representable as i32,
24411       // then we can compare the lower i32 bits and splat the result.
24412 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24413 DAG.ComputeNumSignBits(Op1) > 32) {
24414 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24415 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24416
24417 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24418 static const int MaskLo[] = {0, 0, 2, 2};
24419 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24420
24421 return DAG.getBitcast(VT, Result);
24422 }
24423
24424 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24425 // bits of the inputs before performing those operations. The lower
24426 // compare is always unsigned.
24427 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24428 : 0x0000000080000000ULL,
24429 dl, MVT::v2i64);
24430
24431 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24432 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24433
24434 // Cast everything to the right type.
24435 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24436 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24437
24438 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
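      // Scalar sketch of that identity (illustrative; a hypothetical helper,
      // not part of this lowering). The sign-bit XOR above makes the high
      // halves compare with the required signedness while the low halves
      // always compare unsigned:
      //   bool gt64(int64_t a, int64_t b) {
      //     int32_t  hi1 = int32_t(a >> 32), hi2 = int32_t(b >> 32);
      //     uint32_t lo1 = uint32_t(a),      lo2 = uint32_t(b);
      //     return hi1 > hi2 || (hi1 == hi2 && lo1 > lo2);
      //   }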
24439 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24440 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24441
24442 // Create masks for only the low parts/high parts of the 64 bit integers.
24443 static const int MaskHi[] = { 1, 1, 3, 3 };
24444 static const int MaskLo[] = { 0, 0, 2, 2 };
24445 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24446 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24447 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24448
24449 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24450 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24451
24452 if (Invert)
24453 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24454
24455 return DAG.getBitcast(VT, Result);
24456 }
24457
24458 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24459       // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
24460 // pcmpeqd + pshufd + pand.
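      // Scalar sketch of this synthesis (illustrative only): a 64-bit lane is
      // equal iff both of its 32-bit halves are equal,
      //   EQ64 = EQ32(lo) & EQ32(hi)
      // and the {1, 0, 3, 2} shuffle swaps the halves within each 64-bit lane
      // so the AND combines each half's result with its neighbour's.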
24461 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24462
24463 // First cast everything to the right type.
24464 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24465 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24466
24467 // Do the compare.
24468 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24469
24470 // Make sure the lower and upper halves are both all-ones.
24471 static const int Mask[] = { 1, 0, 3, 2 };
24472 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24473 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24474
24475 if (Invert)
24476 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24477
24478 return DAG.getBitcast(VT, Result);
24479 }
24480 }
24481
24482 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24483 // bits of the inputs before performing those operations.
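  // Scalar sketch of the flip (illustrative, i8 example):
  //   X <u Y  <=>  (X ^ 0x80) <s (Y ^ 0x80)
  // e.g. 5 <u 200: (5 ^ 0x80) = -123 and (200 ^ 0x80) = 72 as i8, and
  // -123 <s 72 holds, matching the unsigned result.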
24484 if (FlipSigns) {
24485 MVT EltVT = VT.getVectorElementType();
24486     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24487                                  VT);
24488 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24489 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24490 }
24491
24492 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24493
24494 // If the logical-not of the result is required, perform that now.
24495 if (Invert)
24496 Result = DAG.getNOT(dl, Result, VT);
24497
24498 return Result;
24499}
24500
24501// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24503 const SDLoc &dl, SelectionDAG &DAG,
24504 const X86Subtarget &Subtarget,
24505 SDValue &X86CC) {
24506 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24507
24508 // Must be a bitcast from vXi1.
24509 if (Op0.getOpcode() != ISD::BITCAST)
24510 return SDValue();
24511
24512 Op0 = Op0.getOperand(0);
24513 MVT VT = Op0.getSimpleValueType();
24514 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24515 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24516 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24517 return SDValue();
24518
24519 X86::CondCode X86Cond;
24520 if (isNullConstant(Op1)) {
24521 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24522 } else if (isAllOnesConstant(Op1)) {
24523 // C flag is set for all ones.
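    // (Illustrative note: KORTEST sets ZF when (LHS | RHS) is all zeros and CF
    // when (LHS | RHS) is all ones, i.e. roughly
    //   ZF = ((LHS | RHS) == 0);  CF = ((LHS | RHS) == ~0);
    // so comparing a vXi1 value against 0 uses ZF and against all-ones uses CF.)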
24524 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24525 } else
24526 return SDValue();
24527
24528   // If the input is an AND, we can combine its operands into the KTEST.
24529 bool KTestable = false;
24530 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24531 KTestable = true;
24532 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24533 KTestable = true;
24534 if (!isNullConstant(Op1))
24535 KTestable = false;
24536 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24537 SDValue LHS = Op0.getOperand(0);
24538 SDValue RHS = Op0.getOperand(1);
24539 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24540 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24541 }
24542
24543   // If the input is an OR, we can combine its operands into the KORTEST.
24544 SDValue LHS = Op0;
24545 SDValue RHS = Op0;
24546 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24547 LHS = Op0.getOperand(0);
24548 RHS = Op0.getOperand(1);
24549 }
24550
24551 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24552 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24553}
24554
24555/// Emit flags for the given setcc condition and operands. Also returns the
24556/// corresponding X86 condition code constant in X86CC.
24557SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24558 ISD::CondCode CC, const SDLoc &dl,
24559 SelectionDAG &DAG,
24560 SDValue &X86CC) const {
24561 // Equality Combines.
24562 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24563 X86::CondCode X86CondCode;
24564
24565 // Optimize to BT if possible.
24566 // Lower (X & (1 << N)) == 0 to BT(X, N).
24567 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24568 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
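    // Sketch of why BT works here (illustrative only): BT copies bit N of X
    // into CF, so
    //   (X & (1 << N)) == 0   ->  BT X, N ; use COND_AE (CF == 0)
    //   ((X >> N) & 1) != 0   ->  BT X, N ; use COND_B  (CF == 1)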
24569 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24570 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24571 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24572 return BT;
24573 }
24574 }
24575
24576     // Try to use PTEST/PMOVMSKB for a tree of AND/ORs compared for equality with -1/0.
24577 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24578 X86CondCode)) {
24579 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24580 return CmpZ;
24581 }
24582
24583 // Try to lower using KORTEST or KTEST.
24584 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24585 return Test;
24586
24587 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24588 // of these.
24589 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24590 // If the input is a setcc, then reuse the input setcc or use a new one
24591 // with the inverted condition.
24592 if (Op0.getOpcode() == X86ISD::SETCC) {
24593 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24594
24595 X86CC = Op0.getOperand(0);
24596 if (Invert) {
24597 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24598 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24599 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24600 }
24601
24602 return Op0.getOperand(1);
24603 }
24604 }
24605
24606 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24607 // overflow.
24608 if (isMinSignedConstant(Op1)) {
24609 EVT VT = Op0.getValueType();
24610 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24611 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24613 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24614 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24615 DAG.getConstant(0, dl, VT), Op0);
24616 return SDValue(Neg.getNode(), 1);
24617 }
24618 }
24619
24620     // Try to use the carry flag from the add in place of a separate CMP for:
24621 // (seteq (add X, -1), -1). Similar for setne.
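    // Why the carry flag works (illustrative, unsigned view): X + (2^BW - 1)
    // carries out for every X except X == 0, and equals -1 only when X == 0, so
    //   (add X, -1) == -1  ->  no carry  ->  COND_AE
    //   (add X, -1) != -1  ->  carry     ->  COND_B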
24622 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24623 Op0.getOperand(1) == Op1) {
24624 if (isProfitableToUseFlagOp(Op0)) {
24625 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24626
24627 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24628 Op0.getOperand(1));
24629 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24630 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24631 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24632 return SDValue(New.getNode(), 1);
24633 }
24634 }
24635 }
24636
24637   X86::CondCode CondCode =
24638       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24639 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24640
24641 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24642 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24643 return EFLAGS;
24644}
24645
24646SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24647
24648 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24649 Op.getOpcode() == ISD::STRICT_FSETCCS;
24650 MVT VT = Op->getSimpleValueType(0);
24651
24652 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24653
24654 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24655 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24656 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24657 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24658 SDLoc dl(Op);
24659 ISD::CondCode CC =
24660 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24661
24662 if (isSoftF16(Op0.getValueType(), Subtarget))
24663 return SDValue();
24664
24665 // Handle f128 first, since one possible outcome is a normal integer
24666 // comparison which gets handled by emitFlagsForSetcc.
24667 if (Op0.getValueType() == MVT::f128) {
24668 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24669 Op.getOpcode() == ISD::STRICT_FSETCCS);
24670
24671 // If softenSetCCOperands returned a scalar, use it.
24672 if (!Op1.getNode()) {
24673 assert(Op0.getValueType() == Op.getValueType() &&
24674 "Unexpected setcc expansion!");
24675 if (IsStrict)
24676 return DAG.getMergeValues({Op0, Chain}, dl);
24677 return Op0;
24678 }
24679 }
24680
24681 if (Op0.getSimpleValueType().isInteger()) {
24682     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
24683     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
24684     // this may translate to fewer uops depending on the uarch implementation. The
24685 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24686 // canonicalize to that CondCode.
24687 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24688     // encoding size - so it must either already be an i8 or i32 immediate, or it
24689 // shrinks down to that. We don't do this for any i64's to avoid additional
24690 // constant materializations.
24691 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
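    // Example of the canonicalization (illustrative, i32):
    //   setgt  X, 7  ->  setge  X, 8
    //   setugt X, 7  ->  setuge X, 8
    // GE/AE read only SF/OF/CF, whereas GT/A additionally read ZF.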
24692 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24693 const APInt &Op1Val = Op1C->getAPIntValue();
24694 if (!Op1Val.isZero()) {
24695 // Ensure the constant+1 doesn't overflow.
24696 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24697 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24698 APInt Op1ValPlusOne = Op1Val + 1;
24699 if (Op1ValPlusOne.isSignedIntN(32) &&
24700 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24701 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24702           CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24703                                           : ISD::CondCode::SETUGE;
24704         }
24705 }
24706 }
24707 }
24708
24709 SDValue X86CC;
24710 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24711 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24712 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24713 }
24714
24715 if (Subtarget.hasAVX10_2()) {
24716 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24717 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24718 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24719 if (Op0.getSimpleValueType() != MVT::f80) {
24720 SDValue Res = getSETCC(
24721 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24722 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24723 }
24724 }
24725 }
24726 // Handle floating point.
24727 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24728 if (CondCode == X86::COND_INVALID)
24729 return SDValue();
24730
24731 SDValue EFLAGS;
24732 if (IsStrict) {
24733 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24734 EFLAGS =
24735         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24736                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24737 Chain = EFLAGS.getValue(1);
24738 } else {
24739 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24740 }
24741
24742 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24743 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24744 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24745}
24746
24747SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24748 SDValue LHS = Op.getOperand(0);
24749 SDValue RHS = Op.getOperand(1);
24750 SDValue Carry = Op.getOperand(2);
24751 SDValue Cond = Op.getOperand(3);
24752 SDLoc DL(Op);
24753
24754 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24755   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24756
24757 // Recreate the carry if needed.
24758 EVT CarryVT = Carry.getValueType();
24759 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24760 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24761
24762 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24763 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24764 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24765}
24766
24767 // This function produces three things: the arithmetic computation itself
24768 // (Value), an EFLAGS result (Overflow), and a condition code (Cond, returned
24769 // through the reference parameter). Together, the flag result and the
24770 // condition code define the case in which the arithmetic computation overflows.
24771static std::pair<SDValue, SDValue>
24773 assert(Op.getResNo() == 0 && "Unexpected result number!");
24774 SDValue Value, Overflow;
24775 SDValue LHS = Op.getOperand(0);
24776 SDValue RHS = Op.getOperand(1);
24777 unsigned BaseOp = 0;
24778 SDLoc DL(Op);
24779 switch (Op.getOpcode()) {
24780 default: llvm_unreachable("Unknown ovf instruction!");
24781 case ISD::SADDO:
24782 BaseOp = X86ISD::ADD;
24783 Cond = X86::COND_O;
24784 break;
24785 case ISD::UADDO:
24786 BaseOp = X86ISD::ADD;
24788 break;
24789 case ISD::SSUBO:
24790 BaseOp = X86ISD::SUB;
24791 Cond = X86::COND_O;
24792 break;
24793 case ISD::USUBO:
24794 BaseOp = X86ISD::SUB;
24795 Cond = X86::COND_B;
24796 break;
24797 case ISD::SMULO:
24798 BaseOp = X86ISD::SMUL;
24799 Cond = X86::COND_O;
24800 break;
24801 case ISD::UMULO:
24802 BaseOp = X86ISD::UMUL;
24803 Cond = X86::COND_O;
24804 break;
24805 }
24806
24807 if (BaseOp) {
24808 // Also sets EFLAGS.
24809 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24810 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24811 Overflow = Value.getValue(1);
24812 }
24813
24814 return std::make_pair(Value, Overflow);
24815}
24816
24818   // Lower the "add/sub/mul with overflow" instruction into a regular instruction
24819   // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24820 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24821 // has only one use.
24822 SDLoc DL(Op);
24823   X86::CondCode Cond;
24824   SDValue Value, Overflow;
24825 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24826
24827 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24828 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24829 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24830}
24831
24832 /// Return true if opcode is an X86 logical comparison.
24834 unsigned Opc = Op.getOpcode();
24835 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24836 Opc == X86ISD::FCMP)
24837 return true;
24838 if (Op.getResNo() == 1 &&
24839 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24841 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24842 return true;
24843
24844 return false;
24845}
24846
24848 if (V.getOpcode() != ISD::TRUNCATE)
24849 return false;
24850
24851 SDValue VOp0 = V.getOperand(0);
24852 unsigned InBits = VOp0.getValueSizeInBits();
24853 unsigned Bits = V.getValueSizeInBits();
24854 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24855}
24856
24857// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24859 unsigned X86CC, const SDLoc &DL,
24860 SelectionDAG &DAG,
24861 const X86Subtarget &Subtarget) {
24862 EVT CmpVT = CmpVal.getValueType();
24863 EVT VT = LHS.getValueType();
24864 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24865 return SDValue();
24866
24867 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24868 isOneConstant(CmpVal.getOperand(1))) {
24869 auto SplatLSB = [&](EVT SplatVT) {
24870       // We need a mask of all zeros or all ones with the same size as the other
24871       // operands.
24872 SDValue Neg = CmpVal;
24873 if (CmpVT.bitsGT(SplatVT))
24874 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24875 else if (CmpVT.bitsLT(SplatVT))
24876 Neg = DAG.getNode(
24877 ISD::AND, DL, SplatVT,
24878 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24879 DAG.getConstant(1, DL, SplatVT));
24880 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24881 };
24882
24883 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24885 return SplatLSB(VT);
24886
24887 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24888 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24890 SDValue Mask = SplatLSB(VT);
24891 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24892 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24893 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24894 }
24895
24896 SDValue Src1, Src2;
24897 auto isIdentityPatternZero = [&]() {
24898 switch (RHS.getOpcode()) {
24899 default:
24900 break;
24901 case ISD::OR:
24902 case ISD::XOR:
24903 case ISD::ADD:
24904 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24905 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24906 Src2 = LHS;
24907 return true;
24908 }
24909 break;
24910 case ISD::SHL:
24911 case ISD::SRA:
24912 case ISD::SRL:
24913 case ISD::SUB:
24914 if (RHS.getOperand(0) == LHS) {
24915 Src1 = RHS.getOperand(1);
24916 Src2 = LHS;
24917 return true;
24918 }
24919 break;
24920 }
24921 return false;
24922 };
24923
24924 auto isIdentityPatternOnes = [&]() {
24925 switch (LHS.getOpcode()) {
24926 default:
24927 break;
24928 case ISD::AND:
24929 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24930 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24931 Src2 = RHS;
24932 return true;
24933 }
24934 break;
24935 }
24936 return false;
24937 };
24938
24939 // Convert 'identity' patterns (iff X is 0 or 1):
24940 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24941 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24942 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24943 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24944 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24945 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24946 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24947 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24948 SDValue Mask = SplatLSB(Src1.getValueType());
24949 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24950 Src1); // Mask & z
24951 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24952 }
24953 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24954 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24955 SDValue Mask = SplatLSB(VT);
24956 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24957 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24958 }
24959 }
24960
24961 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24964 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24965
24966 // 'X - 1' sets the carry flag if X == 0.
24967 // '0 - X' sets the carry flag if X != 0.
24968 // Convert the carry flag to a -1/0 mask with sbb:
24969 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24970 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24971 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24972 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
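    // Scalar sketch of the SBB trick (illustrative only):
    //   mask = CF ? -1 : 0;        // SETCC_CARRY / SBB reg, reg
    //   result = mask | Y;
    // e.g. for select (X != 0), -1, Y: '0 - X' sets CF iff X != 0, so the OR
    // yields -1 exactly when X != 0 and Y otherwise.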
24973 SDValue Sub;
24974 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24975 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24976 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24977 } else {
24978 SDValue One = DAG.getConstant(1, DL, CmpVT);
24979 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24980 }
24981 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24982 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24983 Sub.getValue(1));
24984 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24985 }
24986
24987 return SDValue();
24988}
24989
24990SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24991 bool AddTest = true;
24992 SDValue Cond = Op.getOperand(0);
24993 SDValue Op1 = Op.getOperand(1);
24994 SDValue Op2 = Op.getOperand(2);
24995 SDLoc DL(Op);
24996 MVT VT = Op1.getSimpleValueType();
24997 SDValue CC;
24998
24999 if (isSoftF16(VT, Subtarget)) {
25000 MVT NVT = VT.changeTypeToInteger();
25001 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25002 DAG.getBitcast(NVT, Op1),
25003 DAG.getBitcast(NVT, Op2)));
25004 }
25005
25006 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25007   // are available, or into a VBLENDV if AVX is available.
25008 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25009 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25010 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25011 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25012 bool IsAlwaysSignaling;
25013 unsigned SSECC =
25014 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25015 CondOp0, CondOp1, IsAlwaysSignaling);
25016
25017 if (Subtarget.hasAVX512()) {
25018 SDValue Cmp =
25019 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25020 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25021 assert(!VT.isVector() && "Not a scalar type?");
25022 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25023 }
25024
25025 if (SSECC < 8 || Subtarget.hasAVX()) {
25026 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25027 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25028
25029 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25030 // instead of 3 logic instructions for size savings and potentially speed.
25031 // Unfortunately, there is no scalar form of VBLENDV.
25032 //
25033 // If either operand is a +0.0 constant, don't try this. We can expect to
25034 // optimize away at least one of the logic instructions later in that
25035 // case, so that sequence would be faster than a variable blend.
25036 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25037 !isNullFPConstant(Op2)) {
25038 // Convert to vectors, do a VSELECT, and convert back to scalar.
25039 // All of the conversions should be optimized away.
25040 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25041 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25042 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25043 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25044
25045 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25046 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25047
25048 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25049
25050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25051 DAG.getVectorIdxConstant(0, DL));
25052 }
25053 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25054 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25055 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25056 }
25057 }
25058
25059 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25060 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25061 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25062 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25063 }
25064
25065 if (Cond.getOpcode() == ISD::SETCC &&
25066 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25067 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25068 Cond = NewCond;
25069 // If the condition was updated, it's possible that the operands of the
25070 // select were also updated (for example, EmitTest has a RAUW). Refresh
25071 // the local references to the select operands in case they got stale.
25072 Op1 = Op.getOperand(1);
25073 Op2 = Op.getOperand(2);
25074 }
25075 }
25076
25077 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25078 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25079 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25080 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25081 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25082 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25083 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25084 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
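  // i.e., in scalar form (illustrative, i32):
  //   smin(x, 0) = x & (x >> 31)      // mask is all-ones only when x < 0
  //   smax(x, 0) = x & ~(x >> 31)     // needs an ANDN-style instruction
  // where '>>' is an arithmetic shift by the sign-bit position.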
25085 if (Cond.getOpcode() == X86ISD::SETCC &&
25086 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25087 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25088 SDValue Cmp = Cond.getOperand(1);
25089 SDValue CmpOp0 = Cmp.getOperand(0);
25090 unsigned CondCode = Cond.getConstantOperandVal(0);
25091
25092     // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
25093     // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25094     // handling here to keep the CMP with 0. This should be removed by
25095 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25096 // cttz_zero_undef.
25097 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25098 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25099 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25100 };
25101 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25102 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25103 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25104 // Keep Cmp.
25105 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25106 DL, DAG, Subtarget)) {
25107 return R;
25108 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25109 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25110 ((CondCode == X86::COND_S) || // smin(x, 0)
25111 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25112 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25113 //
25114 // If the comparison is testing for a positive value, we have to invert
25115 // the sign bit mask, so only do that transform if the target has a
25116 // bitwise 'and not' instruction (the invert is free).
25117 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25118 unsigned ShCt = VT.getSizeInBits() - 1;
25119 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25120 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25121 if (CondCode == X86::COND_G)
25122 Shift = DAG.getNOT(DL, Shift, VT);
25123 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25124 }
25125 }
25126
25127 // Look past (and (setcc_carry (cmp ...)), 1).
25128 if (Cond.getOpcode() == ISD::AND &&
25129 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25130 isOneConstant(Cond.getOperand(1)))
25131 Cond = Cond.getOperand(0);
25132
25133 // Attempt to fold "raw cond" cases by treating them as:
25134   // (select (and X, 1), Op1, Op2) --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25135 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25136 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25137 Subtarget))
25138 return R;
25139
25140 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25141 // setting operand in place of the X86ISD::SETCC.
25142 unsigned CondOpcode = Cond.getOpcode();
25143 if (CondOpcode == X86ISD::SETCC ||
25144 CondOpcode == X86ISD::SETCC_CARRY) {
25145 CC = Cond.getOperand(0);
25146
25147 SDValue Cmp = Cond.getOperand(1);
25148 bool IllegalFPCMov = false;
25149 if (VT.isFloatingPoint() && !VT.isVector() &&
25150 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25151 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25152
25153 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25154 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25155 Cond = Cmp;
25156 AddTest = false;
25157 }
25158 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25159 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25160 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25161 SDValue Value;
25162 X86::CondCode X86Cond;
25163 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25164
25165 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25166 AddTest = false;
25167 }
25168
25169 if (AddTest) {
25170 // Look past the truncate if the high bits are known zero.
25171     if (isTruncWithZeroHighBitsInput(Cond, DAG))
25172       Cond = Cond.getOperand(0);
25173
25174 // We know the result of AND is compared against zero. Try to match
25175 // it to BT.
25176 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25177 X86::CondCode X86CondCode;
25178 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25179 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25180 Cond = BT;
25181 AddTest = false;
25182 }
25183 }
25184 }
25185
25186 if (AddTest) {
25187 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25188 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25189 }
25190
25191 // a < b ? -1 : 0 -> RES = ~setcc_carry
25192 // a < b ? 0 : -1 -> RES = setcc_carry
25193 // a >= b ? -1 : 0 -> RES = setcc_carry
25194 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25195 if (Cond.getOpcode() == X86ISD::SUB) {
25196 unsigned CondCode = CC->getAsZExtVal();
25197
25198 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25199 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25200 (isNullConstant(Op1) || isNullConstant(Op2))) {
25201 SDValue Res =
25202 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25203 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25204 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25205 return DAG.getNOT(DL, Res, Res.getValueType());
25206 return Res;
25207 }
25208 }
25209
25210   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25211 // widen the cmov and push the truncate through. This avoids introducing a new
25212 // branch during isel and doesn't add any extensions.
25213 if (Op.getValueType() == MVT::i8 &&
25214 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25215 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25216 if (T1.getValueType() == T2.getValueType() &&
25217 // Exclude CopyFromReg to avoid partial register stalls.
25218 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25219 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25220 CC, Cond);
25221 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25222 }
25223 }
25224
25225 // Or finally, promote i8 cmovs if we have CMOV,
25226 // or i16 cmovs if it won't prevent folding a load.
25227   // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
25228   // legal, but EmitLoweredSelect() cannot deal with these extensions
25229 // being inserted between two CMOV's. (in i16 case too TBN)
25230 // https://bugs.llvm.org/show_bug.cgi?id=40974
25231 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25232 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25233 !X86::mayFoldLoad(Op2, Subtarget))) {
25234 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25235 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25236 SDValue Ops[] = { Op2, Op1, CC, Cond };
25237 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25238 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25239 }
25240
25241 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25242   // the condition is true.
25243 SDValue Ops[] = { Op2, Op1, CC, Cond };
25244 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25245}
25246
25248 const X86Subtarget &Subtarget,
25249 SelectionDAG &DAG) {
25250 MVT VT = Op->getSimpleValueType(0);
25251 SDValue In = Op->getOperand(0);
25252 MVT InVT = In.getSimpleValueType();
25253 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25254 MVT VTElt = VT.getVectorElementType();
25255 unsigned NumElts = VT.getVectorNumElements();
25256
25257 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25258 MVT ExtVT = VT;
25259 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25260 // If v16i32 is to be avoided, we'll need to split and concatenate.
25261 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25262 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25263
25264 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25265 }
25266
25267 // Widen to 512-bits if VLX is not supported.
25268 MVT WideVT = ExtVT;
25269 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25270 NumElts *= 512 / ExtVT.getSizeInBits();
25271 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25272 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25273 DAG.getVectorIdxConstant(0, dl));
25274 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25275 }
25276
25277 SDValue V;
25278 MVT WideEltVT = WideVT.getVectorElementType();
25279 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25280 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25281 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25282 } else {
25283 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25284 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25285 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25286 }
25287
25288 // Truncate if we had to extend i16/i8 above.
25289 if (VT != ExtVT) {
25290 WideVT = MVT::getVectorVT(VTElt, NumElts);
25291 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25292 }
25293
25294 // Extract back to 128/256-bit if we widened.
25295 if (WideVT != VT)
25296 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25297 DAG.getVectorIdxConstant(0, dl));
25298
25299 return V;
25300}
25301
25303 SelectionDAG &DAG) {
25304 SDValue In = Op->getOperand(0);
25305 MVT InVT = In.getSimpleValueType();
25306 SDLoc DL(Op);
25307
25308 if (InVT.getVectorElementType() == MVT::i1)
25309 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25310
25311 assert(Subtarget.hasAVX() && "Expected AVX support");
25312 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25313}
25314
25315// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25316// For sign extend this needs to handle all vector sizes and SSE4.1 and
25317// non-SSE4.1 targets. For zero extend this should only handle inputs of
25318// MVT::v64i8 when BWI is not supported, but AVX512 is.
25320 const X86Subtarget &Subtarget,
25321 SelectionDAG &DAG) {
25322 SDValue In = Op->getOperand(0);
25323 MVT VT = Op->getSimpleValueType(0);
25324 MVT InVT = In.getSimpleValueType();
25325
25326 MVT SVT = VT.getVectorElementType();
25327 MVT InSVT = InVT.getVectorElementType();
25329
25330 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25331 return SDValue();
25332 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25333 return SDValue();
25334 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25335 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25336 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25337 return SDValue();
25338
25339 SDLoc dl(Op);
25340 unsigned Opc = Op.getOpcode();
25341 unsigned NumElts = VT.getVectorNumElements();
25342
25343 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25344 // For 512-bit vectors, we need 128-bits or 256-bits.
25345 if (InVT.getSizeInBits() > 128) {
25346 // Input needs to be at least the same number of elements as output, and
25347 // at least 128-bits.
25348 int InSize = InSVT.getSizeInBits() * NumElts;
25349 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25350 InVT = In.getSimpleValueType();
25351 }
25352
25353   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
25354   // results, so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
25355   // instructions still need to be handled here for 256/512-bit results.
25356 if (Subtarget.hasInt256()) {
25357 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25358
25359 if (InVT.getVectorNumElements() != NumElts)
25360 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25361
25362 // FIXME: Apparently we create inreg operations that could be regular
25363 // extends.
25364 unsigned ExtOpc =
25367 return DAG.getNode(ExtOpc, dl, VT, In);
25368 }
25369
25370 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25371 if (Subtarget.hasAVX()) {
25372 assert(VT.is256BitVector() && "256-bit vector expected");
25373 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25374 int HalfNumElts = HalfVT.getVectorNumElements();
25375
25376 unsigned NumSrcElts = InVT.getVectorNumElements();
25377 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25378 for (int i = 0; i != HalfNumElts; ++i)
25379 HiMask[i] = HalfNumElts + i;
25380
25381 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25382 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25383 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25384 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25385 }
25386
25387 // We should only get here for sign extend.
25388 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25389 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25390 unsigned InNumElts = InVT.getVectorNumElements();
25391
25392 // If the source elements are already all-signbits, we don't need to extend,
25393 // just splat the elements.
25394 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25395 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25396 unsigned Scale = InNumElts / NumElts;
25397 SmallVector<int, 16> ShuffleMask;
25398 for (unsigned I = 0; I != NumElts; ++I)
25399 ShuffleMask.append(Scale, I);
25400 return DAG.getBitcast(VT,
25401 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25402 }
25403
25404 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25405 SDValue Curr = In;
25406 SDValue SignExt = Curr;
25407
25408 // As SRAI is only available on i16/i32 types, we expand only up to i32
25409 // and handle i64 separately.
25410 if (InVT != MVT::v4i32) {
25411 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25412
25413 unsigned DestWidth = DestVT.getScalarSizeInBits();
25414 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25415 unsigned DestElts = DestVT.getVectorNumElements();
25416
25417 // Build a shuffle mask that takes each input element and places it in the
25418 // MSBs of the new element size.
25419 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25420 for (unsigned i = 0; i != DestElts; ++i)
25421 Mask[i * Scale + (Scale - 1)] = i;
25422
25423 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25424 Curr = DAG.getBitcast(DestVT, Curr);
25425
25426 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25427 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25428 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25429 }
25430
25431 if (VT == MVT::v2i64) {
25432 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25433 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25434 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25435 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25436 SignExt = DAG.getBitcast(VT, SignExt);
25437 }
25438
25439 return SignExt;
25440}
25441
25443 SelectionDAG &DAG) {
25444 MVT VT = Op->getSimpleValueType(0);
25445 SDValue In = Op->getOperand(0);
25446 MVT InVT = In.getSimpleValueType();
25447 SDLoc dl(Op);
25448
25449 if (InVT.getVectorElementType() == MVT::i1)
25450 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25451
25452 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25454 "Expected same number of elements");
25455 assert((VT.getVectorElementType() == MVT::i16 ||
25456 VT.getVectorElementType() == MVT::i32 ||
25457 VT.getVectorElementType() == MVT::i64) &&
25458 "Unexpected element type");
25459 assert((InVT.getVectorElementType() == MVT::i8 ||
25460 InVT.getVectorElementType() == MVT::i16 ||
25461 InVT.getVectorElementType() == MVT::i32) &&
25462 "Unexpected element type");
25463
25464 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25465 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25466 return splitVectorIntUnary(Op, DAG, dl);
25467 }
25468
25469 if (Subtarget.hasInt256())
25470 return Op;
25471
25472   // Optimize vectors in AVX mode:
25473   // sign extend v8i16 to v8i32 and
25474   //             v4i32 to v4i64.
25475   //
25476   // Divide the input vector into two parts;
25477   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
25478   // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
25479   // then concat the vectors back to the original VT.
25480 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25481 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25482
25483 unsigned NumElems = InVT.getVectorNumElements();
25484 SmallVector<int,8> ShufMask(NumElems, -1);
25485 for (unsigned i = 0; i != NumElems/2; ++i)
25486 ShufMask[i] = i + NumElems/2;
25487
25488 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25489 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25490
25491 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25492}
25493
25494/// Change a vector store into a pair of half-size vector stores.
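/// For example (illustrative): a 256-bit store to 'Ptr' becomes two
/// independent 128-bit stores, one to 'Ptr' and one to 'Ptr' + 16 bytes,
/// chained together with a TokenFactor.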
25496 SDValue StoredVal = Store->getValue();
25497 assert((StoredVal.getValueType().is256BitVector() ||
25498 StoredVal.getValueType().is512BitVector()) &&
25499 "Expecting 256/512-bit op");
25500
25501 // Splitting volatile memory ops is not allowed unless the operation was not
25502 // legal to begin with. Assume the input store is legal (this transform is
25503 // only used for targets with AVX). Note: It is possible that we have an
25504 // illegal type like v2i128, and so we could allow splitting a volatile store
25505 // in that case if that is important.
25506 if (!Store->isSimple())
25507 return SDValue();
25508
25509 SDLoc DL(Store);
25510 SDValue Value0, Value1;
25511 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25512 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25513 SDValue Ptr0 = Store->getBasePtr();
25514 SDValue Ptr1 =
25515 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25516 SDValue Ch0 =
25517 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25518 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25519 SDValue Ch1 =
25520 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25521 Store->getPointerInfo().getWithOffset(HalfOffset),
25522 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25523 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25524}
25525
25526 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
25527/// type.
25529 SelectionDAG &DAG) {
25530 SDValue StoredVal = Store->getValue();
25531 assert(StoreVT.is128BitVector() &&
25532 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25533 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25534
25535 // Splitting volatile memory ops is not allowed unless the operation was not
25536 // legal to begin with. We are assuming the input op is legal (this transform
25537 // is only used for targets with AVX).
25538 if (!Store->isSimple())
25539 return SDValue();
25540
25541 MVT StoreSVT = StoreVT.getScalarType();
25542 unsigned NumElems = StoreVT.getVectorNumElements();
25543 unsigned ScalarSize = StoreSVT.getStoreSize();
25544
25545 SDLoc DL(Store);
25546   SmallVector<SDValue, 4> Stores;
25547   for (unsigned i = 0; i != NumElems; ++i) {
25548 unsigned Offset = i * ScalarSize;
25549 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25550                                            TypeSize::getFixed(Offset), DL);
25551     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25552 DAG.getVectorIdxConstant(i, DL));
25553 SDValue Ch =
25554 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25555 Store->getPointerInfo().getWithOffset(Offset),
25556 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25557 Stores.push_back(Ch);
25558 }
25559 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25560}
25561
25562static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25563 SelectionDAG &DAG) {
25564 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25565 SDLoc dl(St);
25566 SDValue StoredVal = St->getValue();
25567
25568 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25569 if (StoredVal.getValueType().isVector() &&
25570 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25571 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25572 assert(NumElts <= 8 && "Unexpected VT");
25573 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25574 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25575 "Expected AVX512F without AVX512DQI");
25576
25577 // We must pad with zeros to ensure we store zeroes to any unused bits.
25578 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25579 DAG.getUNDEF(MVT::v16i1), StoredVal,
25580 DAG.getVectorIdxConstant(0, dl));
25581 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25582 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25583 // Make sure we store zeros in the extra bits.
25584 if (NumElts < 8)
25585 StoredVal = DAG.getZeroExtendInReg(
25586 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25587
25588 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25589 St->getPointerInfo(), St->getBaseAlign(),
25590 St->getMemOperand()->getFlags());
25591 }
25592
25593 if (St->isTruncatingStore())
25594 return SDValue();
25595
25596 // If this is a 256/512-bit store of concatenated ops, we are better off
25597 // splitting that store into two half-size stores. This avoids spurious use of
25598 // concatenated ops and each half can execute independently. Some cores would
25599 // split the op into halves anyway, so the concat is purely an extra op.
25600 MVT StoreVT = StoredVal.getSimpleValueType();
25601 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25602 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25603 return splitVectorStore(St, DAG);
25604 return SDValue();
25605 }
25606
25607 if (StoreVT.is32BitVector())
25608 return SDValue();
25609
25610 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25611 assert(StoreVT.is64BitVector() && "Unexpected VT");
25612 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25614 "Unexpected type action!");
25615
25616 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25617 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25618 DAG.getUNDEF(StoreVT));
25619
25620 if (Subtarget.hasSSE2()) {
25621 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25622 // and store it.
25623 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25624 MVT CastVT = MVT::getVectorVT(StVT, 2);
25625 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25626 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25627 DAG.getVectorIdxConstant(0, dl));
25628
25629 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25630 St->getPointerInfo(), St->getBaseAlign(),
25631 St->getMemOperand()->getFlags());
25632 }
25633 assert(Subtarget.hasSSE1() && "Expected SSE");
25634 SDVTList Tys = DAG.getVTList(MVT::Other);
25635 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25636 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25637 St->getMemOperand());
25638}
25639
25640 // Lower vector extended loads using a shuffle. If SSSE3 is not available, we
25641 // may emit an illegal shuffle but the expansion is still better than scalar
25642 // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25643 // we'll emit a shuffle and an arithmetic shift.
25644// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25645// TODO: It is possible to support ZExt by zeroing the undef values during
25646// the shuffle phase or after the shuffle.
25647static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25648 SelectionDAG &DAG) {
25649 MVT RegVT = Op.getSimpleValueType();
25650 assert(RegVT.isVector() && "We only custom lower vector loads.");
25651 assert(RegVT.isInteger() &&
25652 "We only custom lower integer vector loads.");
25653
25654 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25655 SDLoc dl(Ld);
25656
25657 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25658 if (RegVT.getVectorElementType() == MVT::i1) {
25659 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25660 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25661 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25662 "Expected AVX512F without AVX512DQI");
25663
25664 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25665 Ld->getPointerInfo(), Ld->getBaseAlign(),
25666 Ld->getMemOperand()->getFlags());
25667
25668 // Replace chain users with the new chain.
25669 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25670
25671 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25672 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25673 DAG.getBitcast(MVT::v16i1, Val),
25674 DAG.getVectorIdxConstant(0, dl));
25675 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25676 }
25677
25678 return SDValue();
25679}
25680
25681/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25682/// each of which has no other use apart from the AND / OR.
25683static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25684 Opc = Op.getOpcode();
25685 if (Opc != ISD::OR && Opc != ISD::AND)
25686 return false;
25687 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25688 Op.getOperand(0).hasOneUse() &&
25689 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25690 Op.getOperand(1).hasOneUse());
25691}
25692
25693SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25694 SDValue Chain = Op.getOperand(0);
25695 SDValue Cond = Op.getOperand(1);
25696 SDValue Dest = Op.getOperand(2);
25697 SDLoc dl(Op);
25698
25699 // Bail out when we don't have native compare instructions.
25700 if (Cond.getOpcode() == ISD::SETCC &&
25701 Cond.getOperand(0).getValueType() != MVT::f128 &&
25702 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25703 SDValue LHS = Cond.getOperand(0);
25704 SDValue RHS = Cond.getOperand(1);
25705 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25706
25707 // Special case for
25708 // setcc([su]{add,sub,mul}o == 0)
25709 // setcc([su]{add,sub,mul}o != 1)
25710 if (ISD::isOverflowIntrOpRes(LHS) &&
25711 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25712 (isNullConstant(RHS) || isOneConstant(RHS))) {
25713 SDValue Value, Overflow;
25714 X86::CondCode X86Cond;
25715 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25716
25717 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25718 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25719
25720 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25721 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25722 Overflow, Op->getFlags());
25723 }
25724
25725 if (LHS.getSimpleValueType().isInteger()) {
25726 SDValue CCVal;
25727 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25728 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25729 EFLAGS, Op->getFlags());
25730 }
25731
25732 if (CC == ISD::SETOEQ) {
25733 // For FCMP_OEQ, we can emit
25734 // two branches instead of an explicit AND instruction with a
25735 // separate test. However, we only do this if this block doesn't
25736 // have a fall-through edge, because this requires an explicit
25737 // jmp when the condition is false.
25738 if (Op.getNode()->hasOneUse()) {
25739 SDNode *User = *Op.getNode()->user_begin();
25740 // Look for an unconditional branch following this conditional branch.
25741 // We need this because we need to reverse the successors in order
25742 // to implement FCMP_OEQ.
25743 if (User->getOpcode() == ISD::BR) {
25744 SDValue FalseBB = User->getOperand(1);
25745 SDNode *NewBR =
25746 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25747 assert(NewBR == User);
25748 (void)NewBR;
25749 Dest = FalseBB;
25750
25751 SDValue Cmp =
25752 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25753 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25754 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25755 CCVal, Cmp, Op->getFlags());
25756 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25757 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25758 Cmp, Op->getFlags());
25759 }
25760 }
25761 } else if (CC == ISD::SETUNE) {
25762 // For FCMP_UNE, we can emit
25763 // two branches instead of an explicit OR instruction with a
25764 // separate test.
25765 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25766 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25767 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25768 Cmp, Op->getFlags());
25769 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25770 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25771 Cmp, Op->getFlags());
25772 } else {
25773 X86::CondCode X86Cond =
25774 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25775 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25776 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25777 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25778 Cmp, Op->getFlags());
25779 }
25780 }
25781
25782 if (ISD::isOverflowIntrOpRes(Cond)) {
25783 SDValue Value, Overflow;
25784 X86::CondCode X86Cond;
25785 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25786
25787 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25788 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25789 Overflow, Op->getFlags());
25790 }
25791
25792 // Look past the truncate if the high bits are known zero.
25793 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25794 Cond = Cond.getOperand(0);
25795
25796 EVT CondVT = Cond.getValueType();
25797
25798 // Add an AND with 1 if we don't already have one.
25799 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25800 Cond =
25801 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25802
25803 SDValue LHS = Cond;
25804 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25805
25806 SDValue CCVal;
25807 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25808 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25809 Op->getFlags());
25810}
25811
25812// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25813 // Calls to _alloca are needed to probe the stack when allocating more than 4K
25814// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25815// that the guard pages used by the OS virtual memory manager are allocated in
25816// correct sequence.
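// For example, a single 12KB (0x3000-byte) allocation cannot simply adjust the
// stack pointer in one step: the probe (_alloca/__chkstk or the inline probe
// loop) touches each 4K page in turn so that the OS guard page is faulted in
// and re-armed one page at a time.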
25817SDValue
25818X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25819 SelectionDAG &DAG) const {
25820 MachineFunction &MF = DAG.getMachineFunction();
25821 bool SplitStack = MF.shouldSplitStack();
25822 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25823 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25824 SplitStack || EmitStackProbeCall;
25825 SDLoc dl(Op);
25826
25827 // Get the inputs.
25828 SDNode *Node = Op.getNode();
25829 SDValue Chain = Op.getOperand(0);
25830 SDValue Size = Op.getOperand(1);
25831 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25832 EVT VT = Node->getValueType(0);
25833
25834 // Chain the dynamic stack allocation so that it doesn't modify the stack
25835 // pointer when other instructions are using the stack.
25836 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25837
25838 bool Is64Bit = Subtarget.is64Bit();
25839 MVT SPTy = Op.getValueType().getSimpleVT();
25840
25841 SDValue Result;
25842 if (!Lower) {
25843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25844 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25845 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25846 " not tell us which reg is the stack pointer!");
25847
25848 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25849 const Align StackAlign = TFI.getStackAlign();
25850 if (hasInlineStackProbe(MF)) {
25851 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25852 {Chain, Size});
25853 Chain = Result.getValue(1);
25854 } else {
25855 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25856 Chain = SP.getValue(1);
25857 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25858 }
25859 if (Alignment && *Alignment > StackAlign)
25860 Result = DAG.getNode(
25861 ISD::AND, dl, VT, Result,
25862 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25863 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25864 } else if (SplitStack) {
25865 if (Is64Bit) {
25866 // The 64-bit implementation of segmented stacks needs to clobber both r10
25867 // and r11. This makes it impossible to use it along with nested parameters.
25868 const Function &F = MF.getFunction();
25869 for (const auto &A : F.args()) {
25870 if (A.hasNestAttr())
25871 report_fatal_error("Cannot use segmented stacks with functions that "
25872 "have nested arguments.");
25873 }
25874 }
25875
25876 Result =
25877 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25878 Chain = Result.getValue(1);
25879 } else {
25880 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25881 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25882 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25883
25884 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25885 Register SPReg = RegInfo->getStackRegister();
25886 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25887 Chain = SP.getValue(1);
25888
25889 if (Alignment) {
25890 SP = DAG.getNode(
25891 ISD::AND, dl, VT, SP.getValue(0),
25892 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25893 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25894 }
25895
25896 Result = SP;
25897 }
25898
25899 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25900
25901 SDValue Ops[2] = {Result, Chain};
25902 return DAG.getMergeValues(Ops, dl);
25903}
25904
25905SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25906 MachineFunction &MF = DAG.getMachineFunction();
25907 SDValue Ptr = Op.getOperand(1);
25908 EVT PtrVT = Ptr.getValueType();
25909
25910 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25911
25912 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25913 SDLoc DL(Op);
25914
25915 if (!Subtarget.is64Bit() ||
25916 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25917 // vastart just stores the address of the VarArgsFrameIndex slot into the
25918 // memory location argument.
25919 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25920 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25921 }
25922
25923 // __va_list_tag:
25924 // gp_offset (0 - 6 * 8)
25925 // fp_offset (48 - 48 + 8 * 16)
25926 // overflow_arg_area (point to parameters coming in memory).
25927 // reg_save_area
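// Roughly the C layout assumed below (LP64 offsets; the x32 ABI uses 4-byte
// pointers, giving offsets 0/4/8/12):
//   struct __va_list_tag {
//     unsigned gp_offset;       // offset 0
//     unsigned fp_offset;       // offset 4
//     void *overflow_arg_area;  // offset 8
//     void *reg_save_area;      // offset 16
//   };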
25928 SmallVector<SDValue, 8> MemOps;
25929 SDValue FIN = Op.getOperand(1);
25930 // Store gp_offset
25931 SDValue Store = DAG.getStore(
25932 Op.getOperand(0), DL,
25933 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25934 MachinePointerInfo(SV));
25935 MemOps.push_back(Store);
25936
25937 // Store fp_offset
25938 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25939 Store = DAG.getStore(
25940 Op.getOperand(0), DL,
25941 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25942 MachinePointerInfo(SV, 4));
25943 MemOps.push_back(Store);
25944
25945 // Store ptr to overflow_arg_area
25946 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25947 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25948 Store =
25949 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25950 MemOps.push_back(Store);
25951
25952 // Store ptr to reg_save_area.
25953 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25954 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25955 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25956 Store = DAG.getStore(
25957 Op.getOperand(0), DL, RSFIN, FIN,
25958 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25959 MemOps.push_back(Store);
25960 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25961}
25962
25963SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25964 assert(Subtarget.is64Bit() &&
25965 "LowerVAARG only handles 64-bit va_arg!");
25966 assert(Op.getNumOperands() == 4);
25967
25968 MachineFunction &MF = DAG.getMachineFunction();
25969 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25970 // The Win64 ABI uses char* instead of a structure.
25971 return DAG.expandVAArg(Op.getNode());
25972
25973 SDValue Chain = Op.getOperand(0);
25974 SDValue SrcPtr = Op.getOperand(1);
25975 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25976 unsigned Align = Op.getConstantOperandVal(3);
25977 SDLoc dl(Op);
25978
25979 EVT ArgVT = Op.getNode()->getValueType(0);
25980 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25981 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25982 uint8_t ArgMode;
25983
25984 // Decide which area this value should be read from.
25985 // TODO: Implement the AMD64 ABI in its entirety. This simple
25986 // selection mechanism works only for the basic types.
25987 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25988 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25989 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25990 } else {
25991 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25992 "Unhandled argument type in LowerVAARG");
25993 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25994 }
25995
25996 if (ArgMode == 2) {
25997 // Make sure using fp_offset makes sense.
25998 assert(!Subtarget.useSoftFloat() &&
25999 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26000 Subtarget.hasSSE1());
26001 }
26002
26003 // Insert VAARG node into the DAG
26004 // VAARG returns two values: Variable Argument Address, Chain
26005 SDValue InstOps[] = {Chain, SrcPtr,
26006 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26007 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26008 DAG.getTargetConstant(Align, dl, MVT::i32)};
26009 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26010 SDValue VAARG = DAG.getMemIntrinsicNode(
26011 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26012 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26013 /*Alignment=*/std::nullopt,
26014 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26015 Chain = VAARG.getValue(1);
26016
26017 // Load the next argument and return it
26018 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26019}
26020
26021static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26022 SelectionDAG &DAG) {
26023 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26024 // where a va_list is still an i8*.
26025 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26026 if (Subtarget.isCallingConvWin64(
26027 DAG.getMachineFunction().getFunction().getCallingConv()))
26028 // Probably a Win64 va_copy.
26029 return DAG.expandVACopy(Op.getNode());
26030
26031 SDValue Chain = Op.getOperand(0);
26032 SDValue DstPtr = Op.getOperand(1);
26033 SDValue SrcPtr = Op.getOperand(2);
26034 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26035 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26036 SDLoc DL(Op);
26037
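// Copy the whole va_list: 4 + 4 + 8 + 8 = 24 bytes under LP64, or
// 4 + 4 + 4 + 4 = 16 bytes under the x32 ABI where pointers are 32-bit.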
26038 return DAG.getMemcpy(
26039 Chain, DL, DstPtr, SrcPtr,
26040 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26041 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26042 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26043 MachinePointerInfo(SrcSV));
26044}
26045
26046// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26047static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26048 switch (Opc) {
26049 case ISD::SHL:
26050 case X86ISD::VSHL:
26051 case X86ISD::VSHLI:
26052 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26053 case ISD::SRL:
26054 case X86ISD::VSRL:
26055 case X86ISD::VSRLI:
26056 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26057 case ISD::SRA:
26058 case X86ISD::VSRA:
26059 case X86ISD::VSRAI:
26060 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26061 }
26062 llvm_unreachable("Unknown target vector shift node");
26063}
26064
26065/// Handle vector element shifts where the shift amount is a constant.
26066/// Takes immediate version of shift as input.
26067static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26068 SDValue SrcOp, uint64_t ShiftAmt,
26069 SelectionDAG &DAG) {
26070 MVT ElementType = VT.getVectorElementType();
26071
26072 // Bitcast the source vector to the output type, this is mainly necessary for
26073 // vXi8/vXi64 shifts.
26074 if (VT != SrcOp.getSimpleValueType())
26075 SrcOp = DAG.getBitcast(VT, SrcOp);
26076
26077 // Fold this packed shift into its first operand if ShiftAmt is 0.
26078 if (ShiftAmt == 0)
26079 return SrcOp;
26080
26081 // Check for ShiftAmt >= element width
26082 if (ShiftAmt >= ElementType.getSizeInBits()) {
26083 if (Opc == X86ISD::VSRAI)
26084 ShiftAmt = ElementType.getSizeInBits() - 1;
26085 else
26086 return DAG.getConstant(0, dl, VT);
26087 }
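// Note: arithmetic shifts are clamped to (width - 1) because they keep
// replicating the sign bit, whereas logical shifts by >= the element width
// always produce zero, which is why the whole vector folds to a constant zero
// above.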
26088
26089 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26090 && "Unknown target vector shift-by-constant node");
26091
26092 // Fold this packed vector shift into a build vector if SrcOp is a
26093 // vector of Constants or UNDEFs.
26094 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26095 unsigned ShiftOpc;
26096 switch (Opc) {
26097 default: llvm_unreachable("Unknown opcode!");
26098 case X86ISD::VSHLI:
26099 ShiftOpc = ISD::SHL;
26100 break;
26101 case X86ISD::VSRLI:
26102 ShiftOpc = ISD::SRL;
26103 break;
26104 case X86ISD::VSRAI:
26105 ShiftOpc = ISD::SRA;
26106 break;
26107 }
26108
26109 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26110 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26111 return C;
26112 }
26113
26114 return DAG.getNode(Opc, dl, VT, SrcOp,
26115 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26116}
26117
26118/// Handle vector element shifts by a splat shift amount
26119static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26120 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26121 const X86Subtarget &Subtarget,
26122 SelectionDAG &DAG) {
26123 MVT AmtVT = ShAmt.getSimpleValueType();
26124 assert(AmtVT.isVector() && "Vector shift type mismatch");
26125 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26126 "Illegal vector splat index");
26127
26128 // Move the splat element to the bottom element.
26129 if (ShAmtIdx != 0) {
26130 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26131 Mask[0] = ShAmtIdx;
26132 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26133 }
26134
26135 // Peek through any zext node if we can get back to a 128-bit source.
26136 if (AmtVT.getScalarSizeInBits() == 64 &&
26137 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26138 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26139 ShAmt.getOperand(0).getValueType().isSimple() &&
26140 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26141 ShAmt = ShAmt.getOperand(0);
26142 AmtVT = ShAmt.getSimpleValueType();
26143 }
26144
26145 // See if we can mask off the upper elements using the existing source node.
26146 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26147 // do this for vXi64 types.
26148 bool IsMasked = false;
26149 if (AmtVT.getScalarSizeInBits() < 64) {
26150 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26151 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26152 // If the shift amount has come from a scalar, then zero-extend the scalar
26153 // before moving to the vector.
26154 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26155 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26156 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26157 AmtVT = MVT::v4i32;
26158 IsMasked = true;
26159 } else if (ShAmt.getOpcode() == ISD::AND) {
26160 // See if the shift amount is already masked (e.g. for rotation modulo),
26161 // then we can zero-extend it by setting all the other mask elements to
26162 // zero.
26163 SmallVector<SDValue> MaskElts(
26164 AmtVT.getVectorNumElements(),
26165 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26166 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26167 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26168 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26169 {ShAmt.getOperand(1), Mask}))) {
26170 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26171 IsMasked = true;
26172 }
26173 }
26174 }
26175
26176 // Extract if the shift amount vector is larger than 128-bits.
26177 if (AmtVT.getSizeInBits() > 128) {
26178 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26179 AmtVT = ShAmt.getSimpleValueType();
26180 }
26181
26182 // Zero-extend bottom element to v2i64 vector type, either by extension or
26183 // shuffle masking.
26184 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26185 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26186 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26187 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26188 } else if (Subtarget.hasSSE41()) {
26189 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26190 MVT::v2i64, ShAmt);
26191 } else {
26192 SDValue ByteShift = DAG.getTargetConstant(
26193 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26194 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26195 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26196 ByteShift);
26197 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26198 ByteShift);
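// e.g. with a v4i32 amount ByteShift is 12: shifting the whole register left
// and then right by 12 bytes leaves the original low element in place with the
// upper 96 bits cleared, i.e. a zero-extended 64-bit shift amount.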
26199 }
26200 }
26201
26202 // Change opcode to non-immediate version.
26203 Opc = getTargetVShiftUniformOpcode(Opc, /*IsVariable=*/true);
26204
26205 // The return type has to be a 128-bit type with the same element
26206 // type as the input type.
26207 MVT EltVT = VT.getVectorElementType();
26208 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26209
26210 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26211 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26212}
26213
26214/// Return Mask with the necessary casting or extending
26215/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26216static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26217 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26218 const SDLoc &dl) {
26219
26220 if (isAllOnesConstant(Mask))
26221 return DAG.getConstant(1, dl, MaskVT);
26222 if (X86::isZeroNode(Mask))
26223 return DAG.getConstant(0, dl, MaskVT);
26224
26225 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26226
26227 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26228 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26229 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26230 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26231 SDValue Lo, Hi;
26232 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26233 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26234 Hi = DAG.getBitcast(MVT::v32i1, Hi);
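// Lo carries mask bits [31:0] and Hi carries bits [63:32]; concatenating Lo
// first keeps element i of the v64i1 result equal to bit i of the original
// i64 mask.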
26235 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26236 } else {
26237 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26238 Mask.getSimpleValueType().getSizeInBits());
26239 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements are
26240 // extracted by EXTRACT_SUBVECTOR.
26241 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26242 DAG.getBitcast(BitcastVT, Mask),
26243 DAG.getVectorIdxConstant(0, dl));
26244 }
26245}
26246
26247/// Return (and \p Op, \p Mask) for compare instructions or
26248/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26249/// necessary casting or extending for \p Mask when lowering masking intrinsics
26250 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26251 SDValue PreservedSrc,
26252 const X86Subtarget &Subtarget,
26253 SelectionDAG &DAG) {
26254 MVT VT = Op.getSimpleValueType();
26255 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26256 unsigned OpcodeSelect = ISD::VSELECT;
26257 SDLoc dl(Op);
26258
26259 if (isAllOnesConstant(Mask))
26260 return Op;
26261
26262 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26263
26264 if (PreservedSrc.isUndef())
26265 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26266 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26267}
26268
26269/// Creates an SDNode for a predicated scalar operation.
26270/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26271/// The mask is coming as MVT::i8 and it should be transformed
26272/// to MVT::v1i1 while lowering masking intrinsics.
26273/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26274/// "X86select" instead of "vselect". We just can't create the "vselect" node
26275/// for a scalar instruction.
26276 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26277 SDValue PreservedSrc,
26278 const X86Subtarget &Subtarget,
26279 SelectionDAG &DAG) {
26280 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26281 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26282 return Op;
26283
26284 MVT VT = Op.getSimpleValueType();
26285 SDLoc dl(Op);
26286
26287 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
26288 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26289 DAG.getBitcast(MVT::v8i1, Mask),
26290 DAG.getVectorIdxConstant(0, dl));
26291 if (Op.getOpcode() == X86ISD::FSETCCM ||
26292 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26293 Op.getOpcode() == X86ISD::VFPCLASSS)
26294 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26295
26296 if (PreservedSrc.isUndef())
26297 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26298
26299 if (MaskConst) {
26300 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26301 // Discard op and blend passthrough with scalar op src/dst.
26302 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26303 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26304 ShuffleMask[0] = VT.getVectorNumElements();
26305 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26306 ShuffleMask);
26307 }
26308
26309 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26310}
26311
26312 static int getSEHRegistrationNodeSize(const Function *Fn) {
26313 if (!Fn->hasPersonalityFn())
26314 report_fatal_error(
26315 "querying registration node size for function without personality");
26316 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26317 // WinEHStatePass for the full struct definition.
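// That is 6 * 4 = 24 bytes for SEH and 4 * 4 = 16 bytes for C++ EH, matching
// the values returned below.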
26318 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26319 case EHPersonality::MSVC_X86SEH: return 24;
26320 case EHPersonality::MSVC_CXX: return 16;
26321 default: break;
26322 }
26323 report_fatal_error(
26324 "can only recover FP for 32-bit MSVC EH personality functions");
26325}
26326
26327/// When the MSVC runtime transfers control to us, either to an outlined
26328/// function or when returning to a parent frame after catching an exception, we
26329/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26330/// Here's the math:
26331/// RegNodeBase = EntryEBP - RegNodeSize
26332/// ParentFP = RegNodeBase - ParentFrameOffset
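/// For example (illustrative numbers only): with EntryEBP = 0x1000, a 24-byte
/// SEH registration node and ParentFrameOffset = -64, RegNodeBase is
/// 0x1000 - 24 = 0xFE8 and ParentFP is 0xFE8 - (-64) = 0x1028.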
26333/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26334/// subtracting the offset (negative on x86) takes us back to the parent FP.
26335 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26336 SDValue EntryEBP) {
26337 MachineFunction &MF = DAG.getMachineFunction();
26338 SDLoc dl;
26339
26340 // It's possible that the parent function no longer has a personality function
26341 // if the exceptional code was optimized away, in which case we just return
26342 // the incoming EBP.
26343 if (!Fn->hasPersonalityFn())
26344 return EntryEBP;
26345
26346 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26347 // registration, or the .set_setframe offset.
26348 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26349 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26350 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26351 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26352 SDValue ParentFrameOffset =
26353 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26354
26355 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26356 // prologue to RBP in the parent function.
26357 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26358 if (Subtarget.is64Bit())
26359 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26360
26361 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26362 // RegNodeBase = EntryEBP - RegNodeSize
26363 // ParentFP = RegNodeBase - ParentFrameOffset
26364 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26365 DAG.getConstant(RegNodeSize, dl, PtrVT));
26366 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26367}
26368
26369SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26370 SelectionDAG &DAG) const {
26371 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26372 auto isRoundModeCurDirection = [](SDValue Rnd) {
26373 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26374 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26375
26376 return false;
26377 };
26378 auto isRoundModeSAE = [](SDValue Rnd) {
26379 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26380 unsigned RC = C->getZExtValue();
26381 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26382 // Clear the NO_EXC bit and check remaining bits.
26383 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26384 // As a convenience we allow no other bits or explicitly
26385 // current direction.
26386 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26387 }
26388 }
26389
26390 return false;
26391 };
26392 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26393 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26394 RC = C->getZExtValue();
26395 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26396 // Clear the NO_EXC bit and check remaining bits.
26397 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26398 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26399 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26400 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26401 RC == X86::STATIC_ROUNDING::TO_ZERO;
26402 }
26403 }
26404
26405 return false;
26406 };
26407
26408 SDLoc dl(Op);
26409 unsigned IntNo = Op.getConstantOperandVal(0);
26410 MVT VT = Op.getSimpleValueType();
26411 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26412
26413 // Propagate flags from original node to transformed node(s).
26414 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26415
26416 if (IntrData) {
26417 switch(IntrData->Type) {
26418 case INTR_TYPE_1OP: {
26419 // We specify 2 possible opcodes for intrinsics with rounding modes.
26420 // First, we check if the intrinsic may have non-default rounding mode,
26421 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26422 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26423 if (IntrWithRoundingModeOpcode != 0) {
26424 SDValue Rnd = Op.getOperand(2);
26425 unsigned RC = 0;
26426 if (isRoundModeSAEToX(Rnd, RC))
26427 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26428 Op.getOperand(1),
26429 DAG.getTargetConstant(RC, dl, MVT::i32));
26430 if (!isRoundModeCurDirection(Rnd))
26431 return SDValue();
26432 }
26433 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26434 Op.getOperand(1));
26435 }
26436 case INTR_TYPE_1OP_SAE: {
26437 SDValue Sae = Op.getOperand(2);
26438
26439 unsigned Opc;
26440 if (isRoundModeCurDirection(Sae))
26441 Opc = IntrData->Opc0;
26442 else if (isRoundModeSAE(Sae))
26443 Opc = IntrData->Opc1;
26444 else
26445 return SDValue();
26446
26447 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26448 }
26449 case INTR_TYPE_2OP: {
26450 SDValue Src2 = Op.getOperand(2);
26451
26452 // We specify 2 possible opcodes for intrinsics with rounding modes.
26453 // First, we check if the intrinsic may have non-default rounding mode,
26454 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26455 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26456 if (IntrWithRoundingModeOpcode != 0) {
26457 SDValue Rnd = Op.getOperand(3);
26458 unsigned RC = 0;
26459 if (isRoundModeSAEToX(Rnd, RC))
26460 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26461 Op.getOperand(1), Src2,
26462 DAG.getTargetConstant(RC, dl, MVT::i32));
26463 if (!isRoundModeCurDirection(Rnd))
26464 return SDValue();
26465 }
26466
26467 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26468 Op.getOperand(1), Src2);
26469 }
26470 case INTR_TYPE_2OP_SAE: {
26471 SDValue Sae = Op.getOperand(3);
26472
26473 unsigned Opc;
26474 if (isRoundModeCurDirection(Sae))
26475 Opc = IntrData->Opc0;
26476 else if (isRoundModeSAE(Sae))
26477 Opc = IntrData->Opc1;
26478 else
26479 return SDValue();
26480
26481 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26482 Op.getOperand(2));
26483 }
26484 case INTR_TYPE_3OP:
26485 case INTR_TYPE_3OP_IMM8: {
26486 SDValue Src1 = Op.getOperand(1);
26487 SDValue Src2 = Op.getOperand(2);
26488 SDValue Src3 = Op.getOperand(3);
26489
26490 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26491 Src3.getValueType() != MVT::i8) {
26492 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26493 }
26494
26495 // We specify 2 possible opcodes for intrinsics with rounding modes.
26496 // First, we check if the intrinsic may have non-default rounding mode,
26497 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26498 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26499 if (IntrWithRoundingModeOpcode != 0) {
26500 SDValue Rnd = Op.getOperand(4);
26501 unsigned RC = 0;
26502 if (isRoundModeSAEToX(Rnd, RC))
26503 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26504 Src1, Src2, Src3,
26505 DAG.getTargetConstant(RC, dl, MVT::i32));
26506 if (!isRoundModeCurDirection(Rnd))
26507 return SDValue();
26508 }
26509
26510 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26511 {Src1, Src2, Src3});
26512 }
26513 case INTR_TYPE_4OP_IMM8: {
26514 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26515 SDValue Src4 = Op.getOperand(4);
26516 if (Src4.getValueType() != MVT::i8) {
26517 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26518 }
26519
26520 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26521 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26522 Src4);
26523 }
26524 case INTR_TYPE_1OP_MASK: {
26525 SDValue Src = Op.getOperand(1);
26526 SDValue PassThru = Op.getOperand(2);
26527 SDValue Mask = Op.getOperand(3);
26528 // We add rounding mode to the Node when
26529 // - RC Opcode is specified and
26530 // - RC is not "current direction".
26531 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26532 if (IntrWithRoundingModeOpcode != 0) {
26533 SDValue Rnd = Op.getOperand(4);
26534 unsigned RC = 0;
26535 if (isRoundModeSAEToX(Rnd, RC))
26536 return getVectorMaskingNode(
26537 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26538 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26539 Mask, PassThru, Subtarget, DAG);
26540 if (!isRoundModeCurDirection(Rnd))
26541 return SDValue();
26542 }
26543 return getVectorMaskingNode(
26544 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26545 Subtarget, DAG);
26546 }
26547 case INTR_TYPE_1OP_MASK_SAE: {
26548 SDValue Src = Op.getOperand(1);
26549 SDValue PassThru = Op.getOperand(2);
26550 SDValue Mask = Op.getOperand(3);
26551 SDValue Rnd = Op.getOperand(4);
26552
26553 unsigned Opc;
26554 if (isRoundModeCurDirection(Rnd))
26555 Opc = IntrData->Opc0;
26556 else if (isRoundModeSAE(Rnd))
26557 Opc = IntrData->Opc1;
26558 else
26559 return SDValue();
26560
26561 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26562 Subtarget, DAG);
26563 }
26564 case INTR_TYPE_SCALAR_MASK: {
26565 SDValue Src1 = Op.getOperand(1);
26566 SDValue Src2 = Op.getOperand(2);
26567 SDValue passThru = Op.getOperand(3);
26568 SDValue Mask = Op.getOperand(4);
26569 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26570 // There are 2 kinds of intrinsics in this group:
26571 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26572 // (2) With rounding mode and sae - 7 operands.
26573 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26574 if (Op.getNumOperands() == (5U + HasRounding)) {
26575 if (HasRounding) {
26576 SDValue Rnd = Op.getOperand(5);
26577 unsigned RC = 0;
26578 if (isRoundModeSAEToX(Rnd, RC))
26579 return getScalarMaskingNode(
26580 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26581 DAG.getTargetConstant(RC, dl, MVT::i32)),
26582 Mask, passThru, Subtarget, DAG);
26583 if (!isRoundModeCurDirection(Rnd))
26584 return SDValue();
26585 }
26586 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26587 Src2),
26588 Mask, passThru, Subtarget, DAG);
26589 }
26590
26591 assert(Op.getNumOperands() == (6U + HasRounding) &&
26592 "Unexpected intrinsic form");
26593 SDValue RoundingMode = Op.getOperand(5);
26594 unsigned Opc = IntrData->Opc0;
26595 if (HasRounding) {
26596 SDValue Sae = Op.getOperand(6);
26597 if (isRoundModeSAE(Sae))
26598 Opc = IntrWithRoundingModeOpcode;
26599 else if (!isRoundModeCurDirection(Sae))
26600 return SDValue();
26601 }
26602 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26603 Src2, RoundingMode),
26604 Mask, passThru, Subtarget, DAG);
26605 }
26606 case INTR_TYPE_SCALAR_MASK_RND: {
26607 SDValue Src1 = Op.getOperand(1);
26608 SDValue Src2 = Op.getOperand(2);
26609 SDValue passThru = Op.getOperand(3);
26610 SDValue Mask = Op.getOperand(4);
26611 SDValue Rnd = Op.getOperand(5);
26612
26613 SDValue NewOp;
26614 unsigned RC = 0;
26615 if (isRoundModeCurDirection(Rnd))
26616 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26617 else if (isRoundModeSAEToX(Rnd, RC))
26618 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26619 DAG.getTargetConstant(RC, dl, MVT::i32));
26620 else
26621 return SDValue();
26622
26623 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26624 }
26625 case INTR_TYPE_SCALAR_MASK_SAE: {
26626 SDValue Src1 = Op.getOperand(1);
26627 SDValue Src2 = Op.getOperand(2);
26628 SDValue passThru = Op.getOperand(3);
26629 SDValue Mask = Op.getOperand(4);
26630 SDValue Sae = Op.getOperand(5);
26631 unsigned Opc;
26632 if (isRoundModeCurDirection(Sae))
26633 Opc = IntrData->Opc0;
26634 else if (isRoundModeSAE(Sae))
26635 Opc = IntrData->Opc1;
26636 else
26637 return SDValue();
26638
26639 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26640 Mask, passThru, Subtarget, DAG);
26641 }
26642 case INTR_TYPE_2OP_MASK: {
26643 SDValue Src1 = Op.getOperand(1);
26644 SDValue Src2 = Op.getOperand(2);
26645 SDValue PassThru = Op.getOperand(3);
26646 SDValue Mask = Op.getOperand(4);
26647 SDValue NewOp;
26648 if (IntrData->Opc1 != 0) {
26649 SDValue Rnd = Op.getOperand(5);
26650 unsigned RC = 0;
26651 if (isRoundModeSAEToX(Rnd, RC))
26652 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26653 DAG.getTargetConstant(RC, dl, MVT::i32));
26654 else if (!isRoundModeCurDirection(Rnd))
26655 return SDValue();
26656 }
26657 if (!NewOp)
26658 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26659 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26660 }
26661 case INTR_TYPE_2OP_MASK_SAE: {
26662 SDValue Src1 = Op.getOperand(1);
26663 SDValue Src2 = Op.getOperand(2);
26664 SDValue PassThru = Op.getOperand(3);
26665 SDValue Mask = Op.getOperand(4);
26666
26667 unsigned Opc = IntrData->Opc0;
26668 if (IntrData->Opc1 != 0) {
26669 SDValue Sae = Op.getOperand(5);
26670 if (isRoundModeSAE(Sae))
26671 Opc = IntrData->Opc1;
26672 else if (!isRoundModeCurDirection(Sae))
26673 return SDValue();
26674 }
26675
26676 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26677 Mask, PassThru, Subtarget, DAG);
26678 }
26679 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26680 SDValue Src1 = Op.getOperand(1);
26681 SDValue Src2 = Op.getOperand(2);
26682 SDValue Src3 = Op.getOperand(3);
26683 SDValue PassThru = Op.getOperand(4);
26684 SDValue Mask = Op.getOperand(5);
26685 SDValue Sae = Op.getOperand(6);
26686 unsigned Opc;
26687 if (isRoundModeCurDirection(Sae))
26688 Opc = IntrData->Opc0;
26689 else if (isRoundModeSAE(Sae))
26690 Opc = IntrData->Opc1;
26691 else
26692 return SDValue();
26693
26694 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26695 Mask, PassThru, Subtarget, DAG);
26696 }
26697 case INTR_TYPE_3OP_MASK_SAE: {
26698 SDValue Src1 = Op.getOperand(1);
26699 SDValue Src2 = Op.getOperand(2);
26700 SDValue Src3 = Op.getOperand(3);
26701 SDValue PassThru = Op.getOperand(4);
26702 SDValue Mask = Op.getOperand(5);
26703
26704 unsigned Opc = IntrData->Opc0;
26705 if (IntrData->Opc1 != 0) {
26706 SDValue Sae = Op.getOperand(6);
26707 if (isRoundModeSAE(Sae))
26708 Opc = IntrData->Opc1;
26709 else if (!isRoundModeCurDirection(Sae))
26710 return SDValue();
26711 }
26712 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26713 Mask, PassThru, Subtarget, DAG);
26714 }
26715 case BLENDV: {
26716 SDValue Src1 = Op.getOperand(1);
26717 SDValue Src2 = Op.getOperand(2);
26718 SDValue Src3 = Op.getOperand(3);
26719
26720 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26721 Src3 = DAG.getBitcast(MaskVT, Src3);
26722
26723 // Reverse the operands to match VSELECT order.
26724 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26725 }
26726 case VPERM_2OP : {
26727 SDValue Src1 = Op.getOperand(1);
26728 SDValue Src2 = Op.getOperand(2);
26729
26730 // Swap Src1 and Src2 in the node creation
26731 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26732 }
26733 case CFMA_OP_MASKZ:
26734 case CFMA_OP_MASK: {
26735 SDValue Src1 = Op.getOperand(1);
26736 SDValue Src2 = Op.getOperand(2);
26737 SDValue Src3 = Op.getOperand(3);
26738 SDValue Mask = Op.getOperand(4);
26739 MVT VT = Op.getSimpleValueType();
26740
26741 SDValue PassThru = Src3;
26742 if (IntrData->Type == CFMA_OP_MASKZ)
26743 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26744
26745 // We add rounding mode to the Node when
26746 // - RC Opcode is specified and
26747 // - RC is not "current direction".
26748 SDValue NewOp;
26749 if (IntrData->Opc1 != 0) {
26750 SDValue Rnd = Op.getOperand(5);
26751 unsigned RC = 0;
26752 if (isRoundModeSAEToX(Rnd, RC))
26753 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26754 DAG.getTargetConstant(RC, dl, MVT::i32));
26755 else if (!isRoundModeCurDirection(Rnd))
26756 return SDValue();
26757 }
26758 if (!NewOp)
26759 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26760 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26761 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26762 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26763 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26764 }
26765 case IFMA_OP:
26766 // NOTE: We need to swizzle the operands to pass the multiply operands
26767 // first.
26768 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26769 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26770 case FPCLASSS: {
26771 SDValue Src1 = Op.getOperand(1);
26772 SDValue Imm = Op.getOperand(2);
26773 SDValue Mask = Op.getOperand(3);
26774 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26775 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26776 Subtarget, DAG);
26777 // Need to fill with zeros to ensure the bitcast will produce zeroes
26778 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26779 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26780 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26781 DAG.getVectorIdxConstant(0, dl));
26782 return DAG.getBitcast(MVT::i8, Ins);
26783 }
26784
26785 case CMP_MASK_CC: {
26786 MVT MaskVT = Op.getSimpleValueType();
26787 SDValue CC = Op.getOperand(3);
26788 SDValue Mask = Op.getOperand(4);
26789 // We specify 2 possible opcodes for intrinsics with rounding modes.
26790 // First, we check if the intrinsic may have non-default rounding mode,
26791 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26792 if (IntrData->Opc1 != 0) {
26793 SDValue Sae = Op.getOperand(5);
26794 if (isRoundModeSAE(Sae))
26795 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26796 Op.getOperand(2), CC, Mask, Sae);
26797 if (!isRoundModeCurDirection(Sae))
26798 return SDValue();
26799 }
26800 // Default rounding mode.
26801 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26802 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26803 }
26804 case CMP_MASK_SCALAR_CC: {
26805 SDValue Src1 = Op.getOperand(1);
26806 SDValue Src2 = Op.getOperand(2);
26807 SDValue CC = Op.getOperand(3);
26808 SDValue Mask = Op.getOperand(4);
26809
26810 SDValue Cmp;
26811 if (IntrData->Opc1 != 0) {
26812 SDValue Sae = Op.getOperand(5);
26813 if (isRoundModeSAE(Sae))
26814 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26815 else if (!isRoundModeCurDirection(Sae))
26816 return SDValue();
26817 }
26818 // Default rounding mode.
26819 if (!Cmp.getNode())
26820 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26821
26822 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26823 Subtarget, DAG);
26824 // Need to fill with zeros to ensure the bitcast will produce zeroes
26825 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26826 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26827 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26828 DAG.getVectorIdxConstant(0, dl));
26829 return DAG.getBitcast(MVT::i8, Ins);
26830 }
26831 case COMI: { // Comparison intrinsics
26832 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26833 SDValue LHS = Op.getOperand(1);
26834 SDValue RHS = Op.getOperand(2);
26835 // Some conditions require the operands to be swapped.
26836 if (CC == ISD::SETLT || CC == ISD::SETLE)
26837 std::swap(LHS, RHS);
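// (U)COMIS* reports only "above"-style relations through CF/ZF, so LT/LE are
// evaluated as GT/GE with the operands exchanged.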
26838
26839 // For AVX10.2, support EQ and NE.
26840 bool HasAVX10_2_COMX =
26841 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26842
26843 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26844 // For bf16 types we need to fall back.
26845 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26846
26847 auto ComiOpCode = IntrData->Opc0;
26848 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26849
26850 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26851 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26852
26853 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26854
26855 SDValue SetCC;
26856 switch (CC) {
26857 case ISD::SETEQ: {
26858 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26859 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26860 break;
26861 // (ZF = 1 and PF = 0)
26862 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26863 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26864 break;
26865 }
26866 case ISD::SETNE: {
26867 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26868 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26869 break;
26870 // (ZF = 0 or PF = 1)
26871 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26872 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26873 break;
26874 }
26875 case ISD::SETGT: // (CF = 0 and ZF = 0)
26876 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26877 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26878 break;
26879 }
26880 case ISD::SETGE: // CF = 0
26881 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26882 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26883 break;
26884 default:
26885 llvm_unreachable("Unexpected illegal condition!");
26886 }
26887 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26888 }
26889 case COMI_RM: { // Comparison intrinsics with Sae
26890 SDValue LHS = Op.getOperand(1);
26891 SDValue RHS = Op.getOperand(2);
26892 unsigned CondVal = Op.getConstantOperandVal(3);
26893 SDValue Sae = Op.getOperand(4);
26894
26895 SDValue FCmp;
26896 if (isRoundModeCurDirection(Sae))
26897 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26898 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26899 else if (isRoundModeSAE(Sae))
26900 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26901 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26902 else
26903 return SDValue();
26904 // Need to fill with zeros to ensure the bitcast will produce zeroes
26905 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26906 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26907 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26908 DAG.getVectorIdxConstant(0, dl));
26909 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26910 DAG.getBitcast(MVT::i16, Ins));
26911 }
26912 case VSHIFT: {
26913 SDValue SrcOp = Op.getOperand(1);
26914 SDValue ShAmt = Op.getOperand(2);
26915 assert(ShAmt.getValueType() == MVT::i32 &&
26916 "Unexpected VSHIFT amount type");
26917
26918 // Catch shift-by-constant.
26919 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26920 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26921 Op.getSimpleValueType(), SrcOp,
26922 CShAmt->getZExtValue(), DAG);
26923
26924 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26925 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26926 SrcOp, ShAmt, 0, Subtarget, DAG);
26927 }
26928 case COMPRESS_EXPAND_IN_REG: {
26929 SDValue Mask = Op.getOperand(3);
26930 SDValue DataToCompress = Op.getOperand(1);
26931 SDValue PassThru = Op.getOperand(2);
26932 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26933 return Op.getOperand(1);
26934
26935 // Avoid false dependency.
26936 if (PassThru.isUndef())
26937 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26938
26939 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26940 Mask);
26941 }
26942 case FIXUPIMM:
26943 case FIXUPIMM_MASKZ: {
26944 SDValue Src1 = Op.getOperand(1);
26945 SDValue Src2 = Op.getOperand(2);
26946 SDValue Src3 = Op.getOperand(3);
26947 SDValue Imm = Op.getOperand(4);
26948 SDValue Mask = Op.getOperand(5);
26949 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26950 ? Src1
26951 : getZeroVector(VT, Subtarget, DAG, dl);
26952
26953 unsigned Opc = IntrData->Opc0;
26954 if (IntrData->Opc1 != 0) {
26955 SDValue Sae = Op.getOperand(6);
26956 if (isRoundModeSAE(Sae))
26957 Opc = IntrData->Opc1;
26958 else if (!isRoundModeCurDirection(Sae))
26959 return SDValue();
26960 }
26961
26962 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26963
26965 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26966
26967 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26968 }
26969 case ROUNDP: {
26970 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26971 // Clear the upper bits of the rounding immediate so that the legacy
26972 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26973 uint64_t Round = Op.getConstantOperandVal(2);
26974 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26975 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26976 Op.getOperand(1), RoundingMode);
26977 }
26978 case ROUNDS: {
26979 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26980 // Clear the upper bits of the rounding immediate so that the legacy
26981 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26982 uint64_t Round = Op.getConstantOperandVal(3);
26983 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26985 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26986 }
26987 case BEXTRI: {
26988 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26989
26990 uint64_t Imm = Op.getConstantOperandVal(2);
26991 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26992 Op.getValueType());
26993 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26994 Op.getOperand(1), Control);
26995 }
26996 // ADC/SBB
26997 case ADX: {
26998 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26999 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27000
27001 SDValue Res;
27002 // If the carry in is zero, then we should just use ADD/SUB instead of
27003 // ADC/SBB.
27004 if (isNullConstant(Op.getOperand(1))) {
27005 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27006 Op.getOperand(3));
27007 } else {
27008 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27009 DAG.getAllOnesConstant(dl, MVT::i8));
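// Adding 0xFF to the 8-bit carry-in wraps exactly when the carry-in is
// nonzero, so CF is set iff a carry was passed in; the ADC/SBB below then
// consumes it from EFLAGS.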
27010 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27011 Op.getOperand(3), GenCF.getValue(1));
27012 }
27013 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27014 SDValue Results[] = { SetCC, Res };
27015 return DAG.getMergeValues(Results, dl);
27016 }
27017 case CVTPD2PS_MASK:
27018 case CVTPD2DQ_MASK:
27019 case CVTQQ2PS_MASK:
27020 case TRUNCATE_TO_REG: {
27021 SDValue Src = Op.getOperand(1);
27022 SDValue PassThru = Op.getOperand(2);
27023 SDValue Mask = Op.getOperand(3);
27024
27025 if (isAllOnesConstant(Mask))
27026 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27027
27028 MVT SrcVT = Src.getSimpleValueType();
27029 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27030 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27031 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27032 {Src, PassThru, Mask});
27033 }
27034 case TRUNCATE2_TO_REG: {
27035 SDValue Src = Op.getOperand(1);
27036 SDValue Src2 = Op.getOperand(2);
27037 SDValue PassThru = Op.getOperand(3);
27038 SDValue Mask = Op.getOperand(4);
27039
27040 if (isAllOnesConstant(Mask))
27041 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27042
27043 MVT Src2VT = Src2.getSimpleValueType();
27044 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27045 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27046 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27047 {Src, Src2, PassThru, Mask});
27048 }
27049 case CVTPS2PH_MASK: {
27050 SDValue Src = Op.getOperand(1);
27051 SDValue Rnd = Op.getOperand(2);
27052 SDValue PassThru = Op.getOperand(3);
27053 SDValue Mask = Op.getOperand(4);
27054
27055 unsigned RC = 0;
27056 unsigned Opc = IntrData->Opc0;
27057 bool SAE = Src.getValueType().is512BitVector() &&
27058 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27059 if (SAE) {
27061 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27062 }
27063
27064 if (isAllOnesConstant(Mask))
27065 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27066
27067 if (SAE)
27069 else
27070 Opc = IntrData->Opc1;
27071 MVT SrcVT = Src.getSimpleValueType();
27072 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27073 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27074 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27075 }
27076 case CVTNEPS2BF16_MASK: {
27077 SDValue Src = Op.getOperand(1);
27078 SDValue PassThru = Op.getOperand(2);
27079 SDValue Mask = Op.getOperand(3);
27080
27081 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27082 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27083
27084 // Break false dependency.
27085 if (PassThru.isUndef())
27086 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27087
27088 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27089 Mask);
27090 }
27091 default:
27092 break;
27093 }
27094 }
27095
27096 switch (IntNo) {
27097 default: return SDValue(); // Don't custom lower most intrinsics.
27098
27099 // ptest and testp intrinsics. The intrinsic these come from are designed to
27100 // return an integer value, not just an instruction so lower it to the ptest
27101 // or testp pattern and a setcc for the result.
27102 case Intrinsic::x86_avx512_ktestc_b:
27103 case Intrinsic::x86_avx512_ktestc_w:
27104 case Intrinsic::x86_avx512_ktestc_d:
27105 case Intrinsic::x86_avx512_ktestc_q:
27106 case Intrinsic::x86_avx512_ktestz_b:
27107 case Intrinsic::x86_avx512_ktestz_w:
27108 case Intrinsic::x86_avx512_ktestz_d:
27109 case Intrinsic::x86_avx512_ktestz_q:
27110 case Intrinsic::x86_sse41_ptestz:
27111 case Intrinsic::x86_sse41_ptestc:
27112 case Intrinsic::x86_sse41_ptestnzc:
27113 case Intrinsic::x86_avx_ptestz_256:
27114 case Intrinsic::x86_avx_ptestc_256:
27115 case Intrinsic::x86_avx_ptestnzc_256:
27116 case Intrinsic::x86_avx_vtestz_ps:
27117 case Intrinsic::x86_avx_vtestc_ps:
27118 case Intrinsic::x86_avx_vtestnzc_ps:
27119 case Intrinsic::x86_avx_vtestz_pd:
27120 case Intrinsic::x86_avx_vtestc_pd:
27121 case Intrinsic::x86_avx_vtestnzc_pd:
27122 case Intrinsic::x86_avx_vtestz_ps_256:
27123 case Intrinsic::x86_avx_vtestc_ps_256:
27124 case Intrinsic::x86_avx_vtestnzc_ps_256:
27125 case Intrinsic::x86_avx_vtestz_pd_256:
27126 case Intrinsic::x86_avx_vtestc_pd_256:
27127 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27128 unsigned TestOpc = X86ISD::PTEST;
27129 X86::CondCode X86CC;
27130 switch (IntNo) {
27131 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27132 case Intrinsic::x86_avx512_ktestc_b:
27133 case Intrinsic::x86_avx512_ktestc_w:
27134 case Intrinsic::x86_avx512_ktestc_d:
27135 case Intrinsic::x86_avx512_ktestc_q:
27136 // CF = 1
27137 TestOpc = X86ISD::KTEST;
27138 X86CC = X86::COND_B;
27139 break;
27140 case Intrinsic::x86_avx512_ktestz_b:
27141 case Intrinsic::x86_avx512_ktestz_w:
27142 case Intrinsic::x86_avx512_ktestz_d:
27143 case Intrinsic::x86_avx512_ktestz_q:
27144 TestOpc = X86ISD::KTEST;
27145 X86CC = X86::COND_E;
27146 break;
27147 case Intrinsic::x86_avx_vtestz_ps:
27148 case Intrinsic::x86_avx_vtestz_pd:
27149 case Intrinsic::x86_avx_vtestz_ps_256:
27150 case Intrinsic::x86_avx_vtestz_pd_256:
27151 TestOpc = X86ISD::TESTP;
27152 [[fallthrough]];
27153 case Intrinsic::x86_sse41_ptestz:
27154 case Intrinsic::x86_avx_ptestz_256:
27155 // ZF = 1
27156 X86CC = X86::COND_E;
27157 break;
27158 case Intrinsic::x86_avx_vtestc_ps:
27159 case Intrinsic::x86_avx_vtestc_pd:
27160 case Intrinsic::x86_avx_vtestc_ps_256:
27161 case Intrinsic::x86_avx_vtestc_pd_256:
27162 TestOpc = X86ISD::TESTP;
27163 [[fallthrough]];
27164 case Intrinsic::x86_sse41_ptestc:
27165 case Intrinsic::x86_avx_ptestc_256:
27166 // CF = 1
27167 X86CC = X86::COND_B;
27168 break;
27169 case Intrinsic::x86_avx_vtestnzc_ps:
27170 case Intrinsic::x86_avx_vtestnzc_pd:
27171 case Intrinsic::x86_avx_vtestnzc_ps_256:
27172 case Intrinsic::x86_avx_vtestnzc_pd_256:
27173 TestOpc = X86ISD::TESTP;
27174 [[fallthrough]];
27175 case Intrinsic::x86_sse41_ptestnzc:
27176 case Intrinsic::x86_avx_ptestnzc_256:
27177 // ZF and CF = 0
27178 X86CC = X86::COND_A;
27179 break;
27180 }
27181
27182 SDValue LHS = Op.getOperand(1);
27183 SDValue RHS = Op.getOperand(2);
27184 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27185 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27186 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27187 }
27188
27189 case Intrinsic::x86_sse42_pcmpistria128:
27190 case Intrinsic::x86_sse42_pcmpestria128:
27191 case Intrinsic::x86_sse42_pcmpistric128:
27192 case Intrinsic::x86_sse42_pcmpestric128:
27193 case Intrinsic::x86_sse42_pcmpistrio128:
27194 case Intrinsic::x86_sse42_pcmpestrio128:
27195 case Intrinsic::x86_sse42_pcmpistris128:
27196 case Intrinsic::x86_sse42_pcmpestris128:
27197 case Intrinsic::x86_sse42_pcmpistriz128:
27198 case Intrinsic::x86_sse42_pcmpestriz128: {
27199 unsigned Opcode;
27200 X86::CondCode X86CC;
27201 switch (IntNo) {
27202 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27203 case Intrinsic::x86_sse42_pcmpistria128:
27204 Opcode = X86ISD::PCMPISTR;
27205 X86CC = X86::COND_A;
27206 break;
27207 case Intrinsic::x86_sse42_pcmpestria128:
27208 Opcode = X86ISD::PCMPESTR;
27209 X86CC = X86::COND_A;
27210 break;
27211 case Intrinsic::x86_sse42_pcmpistric128:
27212 Opcode = X86ISD::PCMPISTR;
27213 X86CC = X86::COND_B;
27214 break;
27215 case Intrinsic::x86_sse42_pcmpestric128:
27216 Opcode = X86ISD::PCMPESTR;
27217 X86CC = X86::COND_B;
27218 break;
27219 case Intrinsic::x86_sse42_pcmpistrio128:
27220 Opcode = X86ISD::PCMPISTR;
27221 X86CC = X86::COND_O;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpestrio128:
27224 Opcode = X86ISD::PCMPESTR;
27225 X86CC = X86::COND_O;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpistris128:
27228 Opcode = X86ISD::PCMPISTR;
27229 X86CC = X86::COND_S;
27230 break;
27231 case Intrinsic::x86_sse42_pcmpestris128:
27232 Opcode = X86ISD::PCMPESTR;
27233 X86CC = X86::COND_S;
27234 break;
27235 case Intrinsic::x86_sse42_pcmpistriz128:
27236 Opcode = X86ISD::PCMPISTR;
27237 X86CC = X86::COND_E;
27238 break;
27239 case Intrinsic::x86_sse42_pcmpestriz128:
27240 Opcode = X86ISD::PCMPESTR;
27241 X86CC = X86::COND_E;
27242 break;
27243 }
27245 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27246 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27247 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27248 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27249 }
27250
27251 case Intrinsic::x86_sse42_pcmpistri128:
27252 case Intrinsic::x86_sse42_pcmpestri128: {
27253 unsigned Opcode;
27254 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27255 Opcode = X86ISD::PCMPISTR;
27256 else
27257 Opcode = X86ISD::PCMPESTR;
27258
27260 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27261 return DAG.getNode(Opcode, dl, VTs, NewOps);
27262 }
27263
27264 case Intrinsic::x86_sse42_pcmpistrm128:
27265 case Intrinsic::x86_sse42_pcmpestrm128: {
27266 unsigned Opcode;
27267 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27268 Opcode = X86ISD::PCMPISTR;
27269 else
27270 Opcode = X86ISD::PCMPESTR;
27271
27273 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27274 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27275 }
27276
27277 case Intrinsic::eh_sjlj_lsda: {
27278 MachineFunction &MF = DAG.getMachineFunction();
27279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27280 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27281 auto &Context = MF.getContext();
27282 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27283 Twine(MF.getFunctionNumber()));
27284 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27285 DAG.getMCSymbol(S, PtrVT));
27286 }
27287
27288 case Intrinsic::x86_seh_lsda: {
27289 // Compute the symbol for the LSDA. We know it'll get emitted later.
27290 MachineFunction &MF = DAG.getMachineFunction();
27291 SDValue Op1 = Op.getOperand(1);
27292 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27295
27296 // Generate a simple absolute symbol reference. This intrinsic is only
27297 // supported on 32-bit Windows, which isn't PIC.
27298 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27299 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27300 }
27301
27302 case Intrinsic::eh_recoverfp: {
27303 SDValue FnOp = Op.getOperand(1);
27304 SDValue IncomingFPOp = Op.getOperand(2);
27305 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27306 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27307 if (!Fn)
27309 "llvm.eh.recoverfp must take a function as the first argument");
27310 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27311 }
27312
27313 case Intrinsic::localaddress: {
27314 // Returns one of the stack, base, or frame pointer registers, depending on
27315 // which is used to reference local variables.
27316 MachineFunction &MF = DAG.getMachineFunction();
27317 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27318 Register Reg;
27319 if (RegInfo->hasBasePointer(MF))
27320 Reg = RegInfo->getBaseRegister();
27321 else { // Handles the SP or FP case.
27322 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27323 if (CantUseFP)
27324 Reg = RegInfo->getPtrSizedStackRegister(MF);
27325 else
27326 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27327 }
27328 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27329 }
27330 case Intrinsic::x86_avx512_vp2intersect_q_512:
27331 case Intrinsic::x86_avx512_vp2intersect_q_256:
27332 case Intrinsic::x86_avx512_vp2intersect_q_128:
27333 case Intrinsic::x86_avx512_vp2intersect_d_512:
27334 case Intrinsic::x86_avx512_vp2intersect_d_256:
27335 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27336 SDLoc DL(Op);
27337 MVT MaskVT = Op.getSimpleValueType();
27338 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27340 Op.getOperand(1), Op.getOperand(2));
27341 SDValue Result0 =
27342 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27343 SDValue Result1 =
27344 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27345 return DAG.getMergeValues({Result0, Result1}, DL);
27346 }
27347 case Intrinsic::x86_mmx_pslli_w:
27348 case Intrinsic::x86_mmx_pslli_d:
27349 case Intrinsic::x86_mmx_pslli_q:
27350 case Intrinsic::x86_mmx_psrli_w:
27351 case Intrinsic::x86_mmx_psrli_d:
27352 case Intrinsic::x86_mmx_psrli_q:
27353 case Intrinsic::x86_mmx_psrai_w:
27354 case Intrinsic::x86_mmx_psrai_d: {
27355 SDLoc DL(Op);
27356 SDValue ShAmt = Op.getOperand(2);
27357 // If the argument is a constant, convert it to a target constant.
27358 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27359 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
27360 // to 8 bits, which may make them no longer out of bounds.
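// For example, a shift amount of 256 would otherwise end up as an 8-bit
// immediate of 0 and act as a no-op shift; clamping to 255 keeps it out of
// range so the hardware still produces the expected result.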
27361 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27362 if (ShiftAmount == 0)
27363 return Op.getOperand(1);
27364
27365 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27366 Op.getOperand(0), Op.getOperand(1),
27367 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27368 }
27369
27370 unsigned NewIntrinsic;
27371 switch (IntNo) {
27372 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27373 case Intrinsic::x86_mmx_pslli_w:
27374 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27375 break;
27376 case Intrinsic::x86_mmx_pslli_d:
27377 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27378 break;
27379 case Intrinsic::x86_mmx_pslli_q:
27380 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27381 break;
27382 case Intrinsic::x86_mmx_psrli_w:
27383 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27384 break;
27385 case Intrinsic::x86_mmx_psrli_d:
27386 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27387 break;
27388 case Intrinsic::x86_mmx_psrli_q:
27389 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27390 break;
27391 case Intrinsic::x86_mmx_psrai_w:
27392 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27393 break;
27394 case Intrinsic::x86_mmx_psrai_d:
27395 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27396 break;
27397 }
27398
27399 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27400 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27401 // MMX register.
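// MMX_MOVW2D is a MOVD from a 32-bit GPR, which zero-extends the count into
// the 64-bit MMX register read by the shift instruction.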
27402 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27403 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27404 DAG.getTargetConstant(NewIntrinsic, DL,
27406 Op.getOperand(1), ShAmt);
27407 }
27408 case Intrinsic::thread_pointer: {
27409 if (Subtarget.isTargetELF()) {
27410 SDLoc dl(Op);
27411 EVT PtrVT = Op.getValueType();
27412 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27414 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27415 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27416 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27417 }
27419 "Target OS doesn't support __builtin_thread_pointer() yet.");
27420 }
27421 }
27422}
27423
27425 SDValue Src, SDValue Mask, SDValue Base,
27426 SDValue Index, SDValue ScaleOp, SDValue Chain,
27427 const X86Subtarget &Subtarget) {
27428 SDLoc dl(Op);
27429 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27430 // Scale must be constant.
27431 if (!C)
27432 return SDValue();
27433 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27434 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27435 TLI.getPointerTy(DAG.getDataLayout()));
27436 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27437 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27438 // If source is undef or we know it won't be used, use a zero vector
27439 // to break register dependency.
27440 // TODO: use undef instead and let BreakFalseDeps deal with it?
27441 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27442 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27443
27444 // Cast mask to an integer type.
27445 Mask = DAG.getBitcast(MaskVT, Mask);
27446
27448
27449 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27450 SDValue Res =
27452 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27453 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27454}
27455
27457 SDValue Src, SDValue Mask, SDValue Base,
27458 SDValue Index, SDValue ScaleOp, SDValue Chain,
27459 const X86Subtarget &Subtarget) {
27460 MVT VT = Op.getSimpleValueType();
27461 SDLoc dl(Op);
27462 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27463 // Scale must be constant.
27464 if (!C)
27465 return SDValue();
27466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27467 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27468 TLI.getPointerTy(DAG.getDataLayout()));
27469 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27471 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27472
27473 // We support two versions of the gather intrinsics: one with a scalar mask
27474 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
27475 if (Mask.getValueType() != MaskVT)
27476 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27477
27478 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27479 // If source is undef or we know it won't be used, use a zero vector
27480 // to break register dependency.
27481 // TODO: use undef instead and let BreakFalseDeps deal with it?
27482 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27483 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27484
27486
27487 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27488 SDValue Res =
27490 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27491 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27492}
27493
27495 SDValue Src, SDValue Mask, SDValue Base,
27496 SDValue Index, SDValue ScaleOp, SDValue Chain,
27497 const X86Subtarget &Subtarget) {
27498 SDLoc dl(Op);
27499 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27500 // Scale must be constant.
27501 if (!C)
27502 return SDValue();
27503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27504 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27505 TLI.getPointerTy(DAG.getDataLayout()));
27506 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27507 Src.getSimpleValueType().getVectorNumElements());
27508 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27509
27510 // We support two versions of the scatter intrinsics: one with a scalar mask
27511 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
27512 if (Mask.getValueType() != MaskVT)
27513 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27514
27516
27517 SDVTList VTs = DAG.getVTList(MVT::Other);
27518 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27519 SDValue Res =
27521 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27522 return Res;
27523}
27524
27526 SDValue Mask, SDValue Base, SDValue Index,
27527 SDValue ScaleOp, SDValue Chain,
27528 const X86Subtarget &Subtarget) {
27529 SDLoc dl(Op);
27530 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27531 // Scale must be constant.
27532 if (!C)
27533 return SDValue();
27534 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27535 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27536 TLI.getPointerTy(DAG.getDataLayout()));
27537 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27538 SDValue Segment = DAG.getRegister(0, MVT::i32);
27539 MVT MaskVT =
27540 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27541 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27542 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27543 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27544 return SDValue(Res, 0);
27545}
27546
27547 /// Handles the lowering of builtin intrinsics with chain that return their
27548 /// value in registers EDX:EAX.
27549 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27550 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27551 /// TargetOpcode.
27552 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
27553 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27554 /// EDX:EAX).
27556 SelectionDAG &DAG,
27557 unsigned TargetOpcode,
27558 unsigned SrcReg,
27559 const X86Subtarget &Subtarget,
27561 SDValue Chain = N->getOperand(0);
27562 SDValue Glue;
27563
27564 if (SrcReg) {
27565 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27566 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27567 Glue = Chain.getValue(1);
27568 }
27569
27570 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27571 SDValue N1Ops[] = {Chain, Glue};
27572 SDNode *N1 = DAG.getMachineNode(
27573 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27574 Chain = SDValue(N1, 0);
27575
27576 // The expanded instruction returns its result in registers EDX:EAX.
27577 SDValue LO, HI;
27578 if (Subtarget.is64Bit()) {
27579 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27580 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27581 LO.getValue(2));
27582 } else {
27583 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27584 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27585 LO.getValue(2));
27586 }
27587 Chain = HI.getValue(1);
27588 Glue = HI.getValue(2);
27589
27590 if (Subtarget.is64Bit()) {
27591 // Merge the two 32-bit values into a 64-bit one.
27592 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27593 DAG.getConstant(32, DL, MVT::i8));
27594 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27595 Results.push_back(Chain);
27596 return Glue;
27597 }
27598
27599 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27600 SDValue Ops[] = { LO, HI };
27601 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27602 Results.push_back(Pair);
27603 Results.push_back(Chain);
27604 return Glue;
27605}
27606
27607/// Handles the lowering of builtin intrinsics that read the time stamp counter
27608/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27609/// READCYCLECOUNTER nodes.
27610static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27611 SelectionDAG &DAG,
27612 const X86Subtarget &Subtarget,
27614 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27615 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27616 // and the EAX register is loaded with the low-order 32 bits.
27617 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27618 /* NoRegister */0, Subtarget,
27619 Results);
27620 if (Opcode != X86::RDTSCP)
27621 return;
27622
27623 SDValue Chain = Results[1];
27624 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27625 // the ECX register. Add 'ecx' explicitly to the chain.
27626 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27627 Results[1] = ecx;
27628 Results.push_back(ecx.getValue(1));
27629}
27630
27632 SelectionDAG &DAG) {
27634 SDLoc DL(Op);
27635 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27636 Results);
27637 return DAG.getMergeValues(Results, DL);
27638}
27639
27642 SDValue Chain = Op.getOperand(0);
27643 SDValue RegNode = Op.getOperand(2);
27644 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27645 if (!EHInfo)
27646 report_fatal_error("EH registrations only live in functions using WinEH");
27647
27648 // Cast the operand to an alloca, and remember the frame index.
27649 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27650 if (!FINode)
27651 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27652 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27653
27654 // Return the chain operand without making any DAG nodes.
27655 return Chain;
27656}
27657
27660 SDValue Chain = Op.getOperand(0);
27661 SDValue EHGuard = Op.getOperand(2);
27662 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27663 if (!EHInfo)
27664 report_fatal_error("EHGuard only live in functions using WinEH");
27665
27666 // Cast the operand to an alloca, and remember the frame index.
27667 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27668 if (!FINode)
27669 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27670 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27671
27672 // Return the chain operand without making any DAG nodes.
27673 return Chain;
27674}
27675
27676/// Emit Truncating Store with signed or unsigned saturation.
27677static SDValue
27678EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27679 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27680 SelectionDAG &DAG) {
27681 SDVTList VTs = DAG.getVTList(MVT::Other);
27682 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27683 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27684 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27685 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27686}
27687
27688/// Emit Masked Truncating Store with signed or unsigned saturation.
27689static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27690 const SDLoc &DL,
27691 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27692 MachineMemOperand *MMO, SelectionDAG &DAG) {
27693 SDVTList VTs = DAG.getVTList(MVT::Other);
27694 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27695 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27696 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27697}
27698
27700 const MachineFunction &MF) {
27701 if (!Subtarget.is64Bit())
27702 return false;
27703 // 64-bit targets support extended Swift async frame setup,
27704 // except for targets that use the windows 64 prologue.
27705 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27706}
27707
27709 SelectionDAG &DAG) {
27710 unsigned IntNo = Op.getConstantOperandVal(1);
27711 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27712 if (!IntrData) {
27713 switch (IntNo) {
27714
27715 case Intrinsic::swift_async_context_addr: {
27716 SDLoc dl(Op);
27717 auto &MF = DAG.getMachineFunction();
27718 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27719 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27721 X86FI->setHasSwiftAsyncContext(true);
27722 SDValue Chain = Op->getOperand(0);
27723 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
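// With the extended frame, the async context slot lives at RBP - 8, so
// return that address.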
27724 SDValue Result =
27725 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27726 DAG.getTargetConstant(8, dl, MVT::i32)),
27727 0);
27728 // Return { result, chain }.
27729 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27730 CopyRBP.getValue(1));
27731 } else {
27732 // No special extended frame, create or reuse an existing stack slot.
27733 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27734 if (!X86FI->getSwiftAsyncContextFrameIdx())
27735 X86FI->setSwiftAsyncContextFrameIdx(
27736 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27737 false));
27738 SDValue Result =
27739 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27740 PtrSize == 8 ? MVT::i64 : MVT::i32);
27741 // Return { result, chain }.
27742 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27743 Op->getOperand(0));
27744 }
27745 }
27746
27747 case llvm::Intrinsic::x86_seh_ehregnode:
27748 return MarkEHRegistrationNode(Op, DAG);
27749 case llvm::Intrinsic::x86_seh_ehguard:
27750 return MarkEHGuard(Op, DAG);
27751 case llvm::Intrinsic::x86_rdpkru: {
27752 SDLoc dl(Op);
27753 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27754 // Create a RDPKRU node and pass 0 to the ECX parameter.
27755 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27756 DAG.getConstant(0, dl, MVT::i32));
27757 }
27758 case llvm::Intrinsic::x86_wrpkru: {
27759 SDLoc dl(Op);
27760 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27761 // to the EDX and ECX parameters.
27762 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27763 Op.getOperand(0), Op.getOperand(2),
27764 DAG.getConstant(0, dl, MVT::i32),
27765 DAG.getConstant(0, dl, MVT::i32));
27766 }
27767 case llvm::Intrinsic::asan_check_memaccess: {
27768 // Mark this as adjustsStack because it will be lowered to a call.
27770 // Don't do anything here, we will expand these intrinsics out later.
27771 return Op;
27772 }
27773 case llvm::Intrinsic::x86_flags_read_u32:
27774 case llvm::Intrinsic::x86_flags_read_u64:
27775 case llvm::Intrinsic::x86_flags_write_u32:
27776 case llvm::Intrinsic::x86_flags_write_u64: {
27777 // We need a frame pointer because this will get lowered to a PUSH/POP
27778 // sequence.
27781 // Don't do anything here, we will expand these intrinsics out later
27782 // during FinalizeISel in EmitInstrWithCustomInserter.
27783 return Op;
27784 }
27785 case Intrinsic::x86_lwpins32:
27786 case Intrinsic::x86_lwpins64:
27787 case Intrinsic::x86_umwait:
27788 case Intrinsic::x86_tpause: {
27789 SDLoc dl(Op);
27790 SDValue Chain = Op->getOperand(0);
27791 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27792 unsigned Opcode;
27793
27794 switch (IntNo) {
27795 default: llvm_unreachable("Impossible intrinsic");
27796 case Intrinsic::x86_umwait:
27797 Opcode = X86ISD::UMWAIT;
27798 break;
27799 case Intrinsic::x86_tpause:
27800 Opcode = X86ISD::TPAUSE;
27801 break;
27802 case Intrinsic::x86_lwpins32:
27803 case Intrinsic::x86_lwpins64:
27804 Opcode = X86ISD::LWPINS;
27805 break;
27806 }
27807
27808 SDValue Operation =
27809 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27810 Op->getOperand(3), Op->getOperand(4));
27811 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27812 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27813 Operation.getValue(1));
27814 }
27815 case Intrinsic::x86_enqcmd:
27816 case Intrinsic::x86_enqcmds: {
27817 SDLoc dl(Op);
27818 SDValue Chain = Op.getOperand(0);
27819 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27820 unsigned Opcode;
27821 switch (IntNo) {
27822 default: llvm_unreachable("Impossible intrinsic!");
27823 case Intrinsic::x86_enqcmd:
27824 Opcode = X86ISD::ENQCMD;
27825 break;
27826 case Intrinsic::x86_enqcmds:
27827 Opcode = X86ISD::ENQCMDS;
27828 break;
27829 }
27830 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27831 Op.getOperand(3));
27832 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27833 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27834 Operation.getValue(1));
27835 }
27836 case Intrinsic::x86_aesenc128kl:
27837 case Intrinsic::x86_aesdec128kl:
27838 case Intrinsic::x86_aesenc256kl:
27839 case Intrinsic::x86_aesdec256kl: {
27840 SDLoc DL(Op);
27841 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27842 SDValue Chain = Op.getOperand(0);
27843 unsigned Opcode;
27844
27845 switch (IntNo) {
27846 default: llvm_unreachable("Impossible intrinsic");
27847 case Intrinsic::x86_aesenc128kl:
27848 Opcode = X86ISD::AESENC128KL;
27849 break;
27850 case Intrinsic::x86_aesdec128kl:
27851 Opcode = X86ISD::AESDEC128KL;
27852 break;
27853 case Intrinsic::x86_aesenc256kl:
27854 Opcode = X86ISD::AESENC256KL;
27855 break;
27856 case Intrinsic::x86_aesdec256kl:
27857 Opcode = X86ISD::AESDEC256KL;
27858 break;
27859 }
27860
27862 MachineMemOperand *MMO = MemIntr->getMemOperand();
27863 EVT MemVT = MemIntr->getMemoryVT();
27865 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27866 MMO);
27867 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27868
27869 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27870 {ZF, Operation.getValue(0), Operation.getValue(2)});
27871 }
27872 case Intrinsic::x86_aesencwide128kl:
27873 case Intrinsic::x86_aesdecwide128kl:
27874 case Intrinsic::x86_aesencwide256kl:
27875 case Intrinsic::x86_aesdecwide256kl: {
27876 SDLoc DL(Op);
27877 SDVTList VTs = DAG.getVTList(
27878 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27879 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27880 SDValue Chain = Op.getOperand(0);
27881 unsigned Opcode;
27882
27883 switch (IntNo) {
27884 default: llvm_unreachable("Impossible intrinsic");
27885 case Intrinsic::x86_aesencwide128kl:
27886 Opcode = X86ISD::AESENCWIDE128KL;
27887 break;
27888 case Intrinsic::x86_aesdecwide128kl:
27889 Opcode = X86ISD::AESDECWIDE128KL;
27890 break;
27891 case Intrinsic::x86_aesencwide256kl:
27892 Opcode = X86ISD::AESENCWIDE256KL;
27893 break;
27894 case Intrinsic::x86_aesdecwide256kl:
27895 Opcode = X86ISD::AESDECWIDE256KL;
27896 break;
27897 }
27898
27900 MachineMemOperand *MMO = MemIntr->getMemOperand();
27901 EVT MemVT = MemIntr->getMemoryVT();
27902 SDValue Operation = DAG.getMemIntrinsicNode(
27903 Opcode, DL, VTs,
27904 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27905 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27906 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27907 MemVT, MMO);
27908 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27909
27910 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27911 {ZF, Operation.getValue(1), Operation.getValue(2),
27912 Operation.getValue(3), Operation.getValue(4),
27913 Operation.getValue(5), Operation.getValue(6),
27914 Operation.getValue(7), Operation.getValue(8),
27915 Operation.getValue(9)});
27916 }
27917 case Intrinsic::x86_testui: {
27918 SDLoc dl(Op);
27919 SDValue Chain = Op.getOperand(0);
27920 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27921 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27922 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27923 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27924 Operation.getValue(1));
27925 }
27926 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27927 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27928 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27929 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27930 case Intrinsic::x86_t2rpntlvwz0_internal:
27931 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27932 case Intrinsic::x86_t2rpntlvwz1_internal:
27933 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27934 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27936 unsigned IntNo = Op.getConstantOperandVal(1);
27937 unsigned Opc = 0;
27938 switch (IntNo) {
27939 default:
27940 llvm_unreachable("Unexpected intrinsic!");
27941 case Intrinsic::x86_t2rpntlvwz0_internal:
27942 Opc = X86::PT2RPNTLVWZ0V;
27943 break;
27944 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27945 Opc = X86::PT2RPNTLVWZ0T1V;
27946 break;
27947 case Intrinsic::x86_t2rpntlvwz1_internal:
27948 Opc = X86::PT2RPNTLVWZ1V;
27949 break;
27950 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27951 Opc = X86::PT2RPNTLVWZ1T1V;
27952 break;
27953 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27954 Opc = X86::PT2RPNTLVWZ0RSV;
27955 break;
27956 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27957 Opc = X86::PT2RPNTLVWZ0RST1V;
27958 break;
27959 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27960 Opc = X86::PT2RPNTLVWZ1RSV;
27961 break;
27962 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27963 Opc = X86::PT2RPNTLVWZ1RST1V;
27964 break;
27965 }
27966
27967 SDLoc DL(Op);
27968 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27969
27970 SDValue Ops[] = {Op.getOperand(2), // Row
27971 Op.getOperand(3), // Col0
27972 Op.getOperand(4), // Col1
27973 Op.getOperand(5), // Base
27974 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27975 Op.getOperand(6), // Index
27976 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27977 DAG.getRegister(0, MVT::i16), // Segment
27978 Op.getOperand(0)}; // Chain
27979
27980 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27981 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27982 SDValue(Res, 0));
27983 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27984 SDValue(Res, 0));
27985 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27986 }
27987 case Intrinsic::x86_atomic_bts_rm:
27988 case Intrinsic::x86_atomic_btc_rm:
27989 case Intrinsic::x86_atomic_btr_rm: {
27990 SDLoc DL(Op);
27991 MVT VT = Op.getSimpleValueType();
27992 SDValue Chain = Op.getOperand(0);
27993 SDValue Op1 = Op.getOperand(2);
27994 SDValue Op2 = Op.getOperand(3);
27995 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27996 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27997 : X86ISD::LBTR_RM;
27998 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27999 SDValue Res =
28000 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28001 {Chain, Op1, Op2}, VT, MMO);
28002 Chain = Res.getValue(1);
28003 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28004 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28005 }
28006 case Intrinsic::x86_atomic_bts:
28007 case Intrinsic::x86_atomic_btc:
28008 case Intrinsic::x86_atomic_btr: {
28009 SDLoc DL(Op);
28010 MVT VT = Op.getSimpleValueType();
28011 SDValue Chain = Op.getOperand(0);
28012 SDValue Op1 = Op.getOperand(2);
28013 SDValue Op2 = Op.getOperand(3);
28014 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28015 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28016 : X86ISD::LBTR;
28017 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28018 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28019 SDValue Res =
28020 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28021 {Chain, Op1, Op2, Size}, VT, MMO);
28022 Chain = Res.getValue(1);
28023 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
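// CF holds the value of the tested bit; shift it back to the bit position
// given by the immediate so callers see the bit in its original place.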
28024 unsigned Imm = Op2->getAsZExtVal();
28025 if (Imm)
28026 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28027 DAG.getShiftAmountConstant(Imm, VT, DL));
28028 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28029 }
28030 case Intrinsic::x86_cmpccxadd32:
28031 case Intrinsic::x86_cmpccxadd64: {
28032 SDLoc DL(Op);
28033 SDValue Chain = Op.getOperand(0);
28034 SDValue Addr = Op.getOperand(2);
28035 SDValue Src1 = Op.getOperand(3);
28036 SDValue Src2 = Op.getOperand(4);
28037 SDValue CC = Op.getOperand(5);
28038 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28039 SDValue Operation = DAG.getMemIntrinsicNode(
28040 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28041 MVT::i32, MMO);
28042 return Operation;
28043 }
28044 case Intrinsic::x86_aadd32:
28045 case Intrinsic::x86_aadd64:
28046 case Intrinsic::x86_aand32:
28047 case Intrinsic::x86_aand64:
28048 case Intrinsic::x86_aor32:
28049 case Intrinsic::x86_aor64:
28050 case Intrinsic::x86_axor32:
28051 case Intrinsic::x86_axor64: {
28052 SDLoc DL(Op);
28053 SDValue Chain = Op.getOperand(0);
28054 SDValue Op1 = Op.getOperand(2);
28055 SDValue Op2 = Op.getOperand(3);
28056 MVT VT = Op2.getSimpleValueType();
28057 unsigned Opc = 0;
28058 switch (IntNo) {
28059 default:
28060 llvm_unreachable("Unknown Intrinsic");
28061 case Intrinsic::x86_aadd32:
28062 case Intrinsic::x86_aadd64:
28063 Opc = X86ISD::AADD;
28064 break;
28065 case Intrinsic::x86_aand32:
28066 case Intrinsic::x86_aand64:
28067 Opc = X86ISD::AAND;
28068 break;
28069 case Intrinsic::x86_aor32:
28070 case Intrinsic::x86_aor64:
28071 Opc = X86ISD::AOR;
28072 break;
28073 case Intrinsic::x86_axor32:
28074 case Intrinsic::x86_axor64:
28075 Opc = X86ISD::AXOR;
28076 break;
28077 }
28078 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28079 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28080 {Chain, Op1, Op2}, VT, MMO);
28081 }
28082 case Intrinsic::x86_atomic_add_cc:
28083 case Intrinsic::x86_atomic_sub_cc:
28084 case Intrinsic::x86_atomic_or_cc:
28085 case Intrinsic::x86_atomic_and_cc:
28086 case Intrinsic::x86_atomic_xor_cc: {
28087 SDLoc DL(Op);
28088 SDValue Chain = Op.getOperand(0);
28089 SDValue Op1 = Op.getOperand(2);
28090 SDValue Op2 = Op.getOperand(3);
28091 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28092 MVT VT = Op2.getSimpleValueType();
28093 unsigned Opc = 0;
28094 switch (IntNo) {
28095 default:
28096 llvm_unreachable("Unknown Intrinsic");
28097 case Intrinsic::x86_atomic_add_cc:
28098 Opc = X86ISD::LADD;
28099 break;
28100 case Intrinsic::x86_atomic_sub_cc:
28101 Opc = X86ISD::LSUB;
28102 break;
28103 case Intrinsic::x86_atomic_or_cc:
28104 Opc = X86ISD::LOR;
28105 break;
28106 case Intrinsic::x86_atomic_and_cc:
28107 Opc = X86ISD::LAND;
28108 break;
28109 case Intrinsic::x86_atomic_xor_cc:
28110 Opc = X86ISD::LXOR;
28111 break;
28112 }
28113 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28114 SDValue LockArith =
28115 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28116 {Chain, Op1, Op2}, VT, MMO);
28117 Chain = LockArith.getValue(1);
28118 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28119 }
28120 }
28121 return SDValue();
28122 }
28123
28124 SDLoc dl(Op);
28125 switch(IntrData->Type) {
28126 default: llvm_unreachable("Unknown Intrinsic Type");
28127 case RDSEED:
28128 case RDRAND: {
28129 // Emit the node with the right value type.
28130 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28131 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28132
28133 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28134 // Otherwise return the RDRAND/RDSEED result, which is 0 in that case, cast to i32.
28135 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28136 DAG.getConstant(1, dl, Op->getValueType(1)),
28137 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28138 SDValue(Result.getNode(), 1)};
28139 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28140
28141 // Return { result, isValid, chain }.
28142 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28143 SDValue(Result.getNode(), 2));
28144 }
28145 case GATHER_AVX2: {
28146 SDValue Chain = Op.getOperand(0);
28147 SDValue Src = Op.getOperand(2);
28148 SDValue Base = Op.getOperand(3);
28149 SDValue Index = Op.getOperand(4);
28150 SDValue Mask = Op.getOperand(5);
28151 SDValue Scale = Op.getOperand(6);
28152 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28153 Scale, Chain, Subtarget);
28154 }
28155 case GATHER: {
28156 //gather(v1, mask, index, base, scale);
28157 SDValue Chain = Op.getOperand(0);
28158 SDValue Src = Op.getOperand(2);
28159 SDValue Base = Op.getOperand(3);
28160 SDValue Index = Op.getOperand(4);
28161 SDValue Mask = Op.getOperand(5);
28162 SDValue Scale = Op.getOperand(6);
28163 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28164 Chain, Subtarget);
28165 }
28166 case SCATTER: {
28167 //scatter(base, mask, index, v1, scale);
28168 SDValue Chain = Op.getOperand(0);
28169 SDValue Base = Op.getOperand(2);
28170 SDValue Mask = Op.getOperand(3);
28171 SDValue Index = Op.getOperand(4);
28172 SDValue Src = Op.getOperand(5);
28173 SDValue Scale = Op.getOperand(6);
28174 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28175 Scale, Chain, Subtarget);
28176 }
28177 case PREFETCH: {
28178 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28179 assert((HintVal == 2 || HintVal == 3) &&
28180 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28181 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28182 SDValue Chain = Op.getOperand(0);
28183 SDValue Mask = Op.getOperand(2);
28184 SDValue Index = Op.getOperand(3);
28185 SDValue Base = Op.getOperand(4);
28186 SDValue Scale = Op.getOperand(5);
28187 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28188 Subtarget);
28189 }
28190 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28191 case RDTSC: {
28193 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28194 Results);
28195 return DAG.getMergeValues(Results, dl);
28196 }
28197 // Read Performance Monitoring Counters.
28198 case RDPMC:
28199 // Read Processor Register.
28200 case RDPRU:
28201 // GetExtended Control Register.
28202 case XGETBV: {
28204
28205 // RDPMC uses ECX to select the index of the performance counter to read.
28206 // RDPRU uses ECX to select the processor register to read.
28207 // XGETBV uses ECX to select the index of the XCR register to return.
28208 // The result is stored into registers EDX:EAX.
28209 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28210 Subtarget, Results);
28211 return DAG.getMergeValues(Results, dl);
28212 }
28213 // XTEST intrinsics.
28214 case XTEST: {
28215 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28216 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28217
28218 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28219 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28220 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28221 Ret, SDValue(InTrans.getNode(), 1));
28222 }
28225 case TRUNCATE_TO_MEM_VI32: {
28226 SDValue Mask = Op.getOperand(4);
28227 SDValue DataToTruncate = Op.getOperand(3);
28228 SDValue Addr = Op.getOperand(2);
28229 SDValue Chain = Op.getOperand(0);
28230
28232 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28233
28234 EVT MemVT = MemIntr->getMemoryVT();
28235
28236 uint16_t TruncationOp = IntrData->Opc0;
28237 switch (TruncationOp) {
28238 case X86ISD::VTRUNC: {
28239 if (isAllOnesConstant(Mask)) // return just a truncate store
28240 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28241 MemIntr->getMemOperand());
28242
28243 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28244 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28245 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28246
28247 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28248 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28249 true /* truncating */);
28250 }
28251 case X86ISD::VTRUNCUS:
28252 case X86ISD::VTRUNCS: {
28253 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28254 if (isAllOnesConstant(Mask))
28255 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28256 MemIntr->getMemOperand(), DAG);
28257
28258 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28259 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28260
28261 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28262 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28263 }
28264 default:
28265 llvm_unreachable("Unsupported truncstore intrinsic");
28266 }
28267 }
28268 case INTR_TYPE_CAST_MMX:
28269 return SDValue(); // handled in combineINTRINSIC_*
28270 }
28271}
28272
28273SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28274 SelectionDAG &DAG) const {
28275 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28276 MFI.setReturnAddressIsTaken(true);
28277
28278 unsigned Depth = Op.getConstantOperandVal(0);
28279 SDLoc dl(Op);
28280 EVT PtrVT = Op.getValueType();
28281
28282 if (Depth > 0) {
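// FrameAddr points at the saved frame pointer of the requested frame; the
// return address sits one slot above it, hence the SlotSize offset.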
28283 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28284 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28285 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28286 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28287 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28288 MachinePointerInfo());
28289 }
28290
28291 // Just load the return address.
28292 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28293 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28294 MachinePointerInfo());
28295}
28296
28297SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28298 SelectionDAG &DAG) const {
28300 return getReturnAddressFrameIndex(DAG);
28301}
28302
28303SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28304 MachineFunction &MF = DAG.getMachineFunction();
28305 MachineFrameInfo &MFI = MF.getFrameInfo();
28306 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28307 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28308 EVT VT = Op.getValueType();
28309
28310 MFI.setFrameAddressIsTaken(true);
28311
28312 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28313 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28314 // is not possible to crawl up the stack without looking at the unwind codes
28315 // simultaneously.
28316 int FrameAddrIndex = FuncInfo->getFAIndex();
28317 if (!FrameAddrIndex) {
28318 // Set up a frame object for the return address.
28319 unsigned SlotSize = RegInfo->getSlotSize();
28320 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28321 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28322 FuncInfo->setFAIndex(FrameAddrIndex);
28323 }
28324 return DAG.getFrameIndex(FrameAddrIndex, VT);
28325 }
28326
28327 Register FrameReg =
28328 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28329 SDLoc dl(Op); // FIXME probably not meaningful
28330 unsigned Depth = Op.getConstantOperandVal(0);
28331 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28332 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28333 "Invalid Frame Register!");
28334 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28335 while (Depth--)
28336 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28337 MachinePointerInfo());
28338 return FrameAddr;
28339}
28340
28341// FIXME? Maybe this could be a TableGen attribute on some registers and
28342// this table could be generated automatically from RegInfo.
28344 const MachineFunction &MF) const {
28345 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28346
28348 .Case("esp", X86::ESP)
28349 .Case("rsp", X86::RSP)
28350 .Case("ebp", X86::EBP)
28351 .Case("rbp", X86::RBP)
28352 .Case("r14", X86::R14)
28353 .Case("r15", X86::R15)
28354 .Default(0);
28355
28356 if (Reg == X86::EBP || Reg == X86::RBP) {
28357 if (!TFI.hasFP(MF))
28358 report_fatal_error("register " + StringRef(RegName) +
28359 " is allocatable: function has no frame pointer");
28360#ifndef NDEBUG
28361 else {
28362 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28363 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28364 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28365 "Invalid Frame Register!");
28366 }
28367#endif
28368 }
28369
28370 return Reg;
28371}
28372
28373SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28374 SelectionDAG &DAG) const {
28375 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
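// Skip the saved frame pointer and the return address, one slot each.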
28376 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28377}
28378
28380 const Constant *PersonalityFn) const {
28381 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28382 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28383
28384 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28385}
28386
28388 const Constant *PersonalityFn) const {
28389 // Funclet personalities don't use selectors (the runtime does the selection).
28391 return X86::NoRegister;
28392 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28393}
28394
28396 return Subtarget.isTargetWin64();
28397}
28398
28399SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28400 SDValue Chain = Op.getOperand(0);
28401 SDValue Offset = Op.getOperand(1);
28402 SDValue Handler = Op.getOperand(2);
28403 SDLoc dl (Op);
28404
28405 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28406 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28407 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28408 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28409 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28410 "Invalid Frame Register!");
28411 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28412 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28413
28414 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28415 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28416 dl));
28417 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28418 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28419 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28420
28421 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28422 DAG.getRegister(StoreAddrReg, PtrVT));
28423}
28424
28425SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28426 SelectionDAG &DAG) const {
28427 SDLoc DL(Op);
28428 // If the subtarget is not 64-bit, we may need the global base reg
28429 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
28430 // Therefore, ask for the GlobalBaseReg now, so that the pass
28431 // inserts the code for us in case we need it.
28432 // Otherwise, we would end up referencing a virtual register
28433 // that is not defined!
28434 if (!Subtarget.is64Bit()) {
28435 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28436 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28437 }
28438 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28439 DAG.getVTList(MVT::i32, MVT::Other),
28440 Op.getOperand(0), Op.getOperand(1));
28441}
28442
28443SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28444 SelectionDAG &DAG) const {
28445 SDLoc DL(Op);
28446 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28447 Op.getOperand(0), Op.getOperand(1));
28448}
28449
28450SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28451 SelectionDAG &DAG) const {
28452 SDLoc DL(Op);
28453 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28454 Op.getOperand(0));
28455}
28456
28458 return Op.getOperand(0);
28459}
28460
28461SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28462 SelectionDAG &DAG) const {
28463 SDValue Root = Op.getOperand(0);
28464 SDValue Trmp = Op.getOperand(1); // trampoline
28465 SDValue FPtr = Op.getOperand(2); // nested function
28466 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28467 SDLoc dl (Op);
28468
28469 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28470 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28471
28472 if (Subtarget.is64Bit()) {
28473 SDValue OutChains[6];
28474
28475 // Large code-model.
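// The trampoline emitted below is, in effect:
//   movabsq $FPtr, %r11
//   movabsq $Nest, %r10
//   jmpq    *%r11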
28476 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28477 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28478
28479 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28480 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28481
28482 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28483
28484 // Load the pointer to the nested function into R11.
28485 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28486 SDValue Addr = Trmp;
28487 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28488 Addr, MachinePointerInfo(TrmpAddr));
28489
28490 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28491 DAG.getConstant(2, dl, MVT::i64));
28492 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28493 MachinePointerInfo(TrmpAddr, 2), Align(2));
28494
28495 // Load the 'nest' parameter value into R10.
28496 // R10 is specified in X86CallingConv.td
28497 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28499 DAG.getConstant(10, dl, MVT::i64));
28500 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28501 Addr, MachinePointerInfo(TrmpAddr, 10));
28502
28503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28504 DAG.getConstant(12, dl, MVT::i64));
28505 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28506 MachinePointerInfo(TrmpAddr, 12), Align(2));
28507
28508 // Jump to the nested function.
28509 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28510 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28511 DAG.getConstant(20, dl, MVT::i64));
28512 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28513 Addr, MachinePointerInfo(TrmpAddr, 20));
28514
28515 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28517 DAG.getConstant(22, dl, MVT::i64));
28518 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28519 Addr, MachinePointerInfo(TrmpAddr, 22));
28520
28521 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28522 } else {
28523 const Function *Func =
28524 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28525 CallingConv::ID CC = Func->getCallingConv();
28526 unsigned NestReg;
28527
28528 switch (CC) {
28529 default:
28530 llvm_unreachable("Unsupported calling convention");
28531 case CallingConv::C:
28533 // Pass 'nest' parameter in ECX.
28534 // Must be kept in sync with X86CallingConv.td
28535 NestReg = X86::ECX;
28536
28537 // Check that ECX wasn't needed by an 'inreg' parameter.
28538 FunctionType *FTy = Func->getFunctionType();
28539 const AttributeList &Attrs = Func->getAttributes();
28540
28541 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28542 unsigned InRegCount = 0;
28543 unsigned Idx = 0;
28544
28545 for (FunctionType::param_iterator I = FTy->param_begin(),
28546 E = FTy->param_end(); I != E; ++I, ++Idx)
28547 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28548 const DataLayout &DL = DAG.getDataLayout();
28549 // FIXME: should only count parameters that are lowered to integers.
28550 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28551 }
28552
28553 if (InRegCount > 2) {
28554 report_fatal_error("Nest register in use - reduce number of inreg"
28555 " parameters!");
28556 }
28557 }
28558 break;
28559 }
28562 case CallingConv::Fast:
28563 case CallingConv::Tail:
28565 // Pass 'nest' parameter in EAX.
28566 // Must be kept in sync with X86CallingConv.td
28567 NestReg = X86::EAX;
28568 break;
28569 }
28570
28571 SDValue OutChains[4];
28572 SDValue Addr, Disp;
28573
28574 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28575 DAG.getConstant(10, dl, MVT::i32));
28576 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28577
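// The 32-bit trampoline emitted below is, in effect:
//   movl $Nest, %NestReg
//   jmp  FPtr              (encoded as a rel32 displacement)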
28578 // This is storing the opcode for MOV32ri.
28579 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28580 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28581 OutChains[0] =
28582 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28583 Trmp, MachinePointerInfo(TrmpAddr));
28584
28585 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28586 DAG.getConstant(1, dl, MVT::i32));
28587 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28588 MachinePointerInfo(TrmpAddr, 1), Align(1));
28589
28590 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28591 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28592 DAG.getConstant(5, dl, MVT::i32));
28593 OutChains[2] =
28594 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28595 MachinePointerInfo(TrmpAddr, 5), Align(1));
28596
28597 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28598 DAG.getConstant(6, dl, MVT::i32));
28599 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28600 MachinePointerInfo(TrmpAddr, 6), Align(1));
28601
28602 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28603 }
28604}
28605
28606SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28607 SelectionDAG &DAG) const {
28608 /*
28609 The rounding mode is in bits 11:10 of the FP Control Word (FPCW), and has
28610 the following settings:
28611 00 Round to nearest
28612 01 Round to -inf
28613 10 Round to +inf
28614 11 Round to 0
28615
28616 GET_ROUNDING, on the other hand, expects the following:
28617 -1 Undefined
28618 0 Round to 0
28619 1 Round to nearest
28620 2 Round to +inf
28621 3 Round to -inf
28622
28623 To perform the conversion, we use a packed lookup table of the four 2-bit
28624 values that we can index by FPCW[11:10]
28625 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
28626
28627 (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
28628 */
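// For example, with RC bits 11:10 = 01 (round toward -inf):
//   (0x2d >> ((0x0400 & 0xc00) >> 9)) & 3 = (0x2d >> 2) & 3 = 3,
// which is GET_ROUNDING's encoding for "round to -inf".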
28629
28630 MachineFunction &MF = DAG.getMachineFunction();
28631 MVT VT = Op.getSimpleValueType();
28632 SDLoc DL(Op);
28633
28634 // Save FP Control Word to stack slot
28635 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28636 SDValue StackSlot =
28637 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28638
28639 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28640
28641 SDValue Chain = Op.getOperand(0);
28642 SDValue Ops[] = {Chain, StackSlot};
28644 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28646
28647 // Load FP Control Word from stack slot
28648 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28649 Chain = CWD.getValue(1);
28650
28651 // Mask and turn the control bits into a shift for the lookup table.
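// Shifting right by 9 rather than 10 keeps the index scaled by 2, since each
// entry in the packed lookup table is 2 bits wide.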
28652 SDValue Shift =
28653 DAG.getNode(ISD::SRL, DL, MVT::i16,
28654 DAG.getNode(ISD::AND, DL, MVT::i16,
28655 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28656 DAG.getConstant(9, DL, MVT::i8));
28657 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28658
28659 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28660 SDValue RetVal =
28661 DAG.getNode(ISD::AND, DL, MVT::i32,
28662 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28663 DAG.getConstant(3, DL, MVT::i32));
28664
28665 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28666
28667 return DAG.getMergeValues({RetVal, Chain}, DL);
28668}
28669
28670SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28671 SelectionDAG &DAG) const {
28672 MachineFunction &MF = DAG.getMachineFunction();
28673 SDLoc DL(Op);
28674 SDValue Chain = Op.getNode()->getOperand(0);
28675
28676 // FP control word may be set only from data in memory. So we need to allocate
28677 // stack space to save/load FP control word.
28678 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28679 SDValue StackSlot =
28680 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28681 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28682 MachineMemOperand *MMO =
28683 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28684
28685 // Store FP control word into memory.
28686 SDValue Ops[] = {Chain, StackSlot};
28687 Chain = DAG.getMemIntrinsicNode(
28688 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28689
28690 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28691 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28692 Chain = CWD.getValue(1);
28693 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28694 DAG.getConstant(0xf3ff, DL, MVT::i16));
28695
28696 // Calculate new rounding mode.
28697 SDValue NewRM = Op.getNode()->getOperand(1);
28698 SDValue RMBits;
28699 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28700 uint64_t RM = CVal->getZExtValue();
28701 int FieldVal;
28702 switch (static_cast<RoundingMode>(RM)) {
28703 // clang-format off
28704 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28705 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28706 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28707 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28708 default:
28709 llvm_unreachable("rounding mode is not supported by X86 hardware");
28710 // clang-format on
28711 }
28712 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28713 } else {
28714 // Need to convert argument into bits of control word:
28715 // 0 Round to 0 -> 11
28716 // 1 Round to nearest -> 00
28717 // 2 Round to +inf -> 10
28718 // 3 Round to -inf -> 01
28719 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28720 // To make the conversion, put all these values into a value 0xc9 and shift
28721 // it left depending on the rounding mode:
28722 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28723 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28724 // ...
28725 // (0xc9 << (2 * NewRM + 4)) & 0xc00
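// For example, NewRM = 2 (round to +inf) gives a shift of 2*2+4 = 8, and
// (0xc9 << 8) & 0xc00 = 0x800, i.e. RM bits 11:10 = 10 (x87 round up).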
28726 SDValue ShiftValue =
28727 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28728 DAG.getNode(ISD::ADD, DL, MVT::i32,
28729 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28730 DAG.getConstant(1, DL, MVT::i8)),
28731 DAG.getConstant(4, DL, MVT::i32)));
28732 SDValue Shifted =
28733 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28734 ShiftValue);
28735 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28736 DAG.getConstant(0xc00, DL, MVT::i16));
28737 }
28738
28739 // Update rounding mode bits and store the new FP Control Word into stack.
28740 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28741 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28742
28743 // Load FP control word from the slot.
28744 SDValue OpsLD[] = {Chain, StackSlot};
28745 MachineMemOperand *MMOL =
28746 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28747 Chain = DAG.getMemIntrinsicNode(
28748 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28749
28750 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28751 // same way but in bits 14:13.
28752 if (Subtarget.hasSSE1()) {
28753 // Store MXCSR into memory.
28754 Chain = DAG.getNode(
28755 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28756 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28757 StackSlot);
28758
28759 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28760 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28761 Chain = CWD.getValue(1);
28762 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28763 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28764
28765 // Shift X87 RM bits from 11:10 to 14:13.
28766 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28767 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28768 DAG.getConstant(3, DL, MVT::i8));
28769
28770 // Update rounding mode bits and store the new FP Control Word into stack.
28771 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28772 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28773
28774 // Load MXCSR from the slot.
28775 Chain = DAG.getNode(
28776 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28777 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28778 StackSlot);
28779 }
28780
28781 return Chain;
28782}
28783
28784const unsigned X87StateSize = 28;
28785const unsigned FPStateSize = 32;
28786[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28787
28788SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28789 SelectionDAG &DAG) const {
28791 SDLoc DL(Op);
28792 SDValue Chain = Op->getOperand(0);
28793 SDValue Ptr = Op->getOperand(1);
28795 EVT MemVT = Node->getMemoryVT();
28797 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28798
28799 // Get the x87 state, if present.
28800 if (Subtarget.hasX87()) {
28801 Chain =
28802 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28803 {Chain, Ptr}, MemVT, MMO);
28804
28805 // FNSTENV changes the exception mask, so load back the stored environment.
28806 MachineMemOperand::Flags NewFlags =
28809 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28810 Chain =
28811 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28812 {Chain, Ptr}, MemVT, MMO);
28813 }
28814
28815 // If target supports SSE, get MXCSR as well.
28816 if (Subtarget.hasSSE1()) {
28817 // Get pointer to the MXCSR location in memory.
28819 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28820 DAG.getConstant(X87StateSize, DL, PtrVT));
28821 // Store MXCSR into memory.
28822 Chain = DAG.getNode(
28823 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28824 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28825 MXCSRAddr);
28826 }
28827
28828 return Chain;
28829}
28830
28832 EVT MemVT, MachineMemOperand *MMO,
28833 SelectionDAG &DAG,
28834 const X86Subtarget &Subtarget) {
28835 // Set the x87 state, if present.
28836 if (Subtarget.hasX87())
28837 Chain =
28838 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28839 {Chain, Ptr}, MemVT, MMO);
28840 // If target supports SSE, set MXCSR as well.
28841 if (Subtarget.hasSSE1()) {
28842 // Get pointer to the MXCSR location in memory.
28844 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28845 DAG.getConstant(X87StateSize, DL, PtrVT));
28846 // Load MXCSR from memory.
28847 Chain = DAG.getNode(
28848 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28849 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28850 MXCSRAddr);
28851 }
28852 return Chain;
28853}
28854
28855SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28856 SelectionDAG &DAG) const {
28857 SDLoc DL(Op);
28858 SDValue Chain = Op->getOperand(0);
28859 SDValue Ptr = Op->getOperand(1);
28861 EVT MemVT = Node->getMemoryVT();
28863 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28864 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28865}
28866
28867SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28868 SelectionDAG &DAG) const {
28869 MachineFunction &MF = DAG.getMachineFunction();
28870 SDLoc DL(Op);
28871 SDValue Chain = Op.getNode()->getOperand(0);
28872
28873 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28874 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28875 SmallVector<Constant *, 8> FPEnvVals;
28876
28877 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28878 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28879 // for compatibility with glibc.
28880 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28881 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28882 Constant *Zero = ConstantInt::get(ItemTy, 0);
28883 for (unsigned I = 0; I < 6; ++I)
28884 FPEnvVals.push_back(Zero);
28885
28886 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28887 // all exception flags, and sets DAZ and FTZ to 0.
28888 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28889 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28890 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28891 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28892 MachinePointerInfo MPI =
28894 MachineMemOperand *MMO = MF.getMachineMemOperand(
28896
28897 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28898}
28899
28900// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28901uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28902 assert((Amt < 8) && "Shift/Rotation amount out of range");
28903 switch (Opcode) {
28904 case ISD::BITREVERSE:
28905 return 0x8040201008040201ULL;
28906 case ISD::SHL:
28907 return ((0x0102040810204080ULL >> (Amt)) &
28908 (0x0101010101010101ULL * (0xFF >> (Amt))));
28909 case ISD::SRL:
28910 return ((0x0102040810204080ULL << (Amt)) &
28911 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28912 case ISD::SRA:
28913 return (getGFNICtrlImm(ISD::SRL, Amt) |
28914 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28915 case ISD::ROTL:
28916 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28917 case ISD::ROTR:
28918 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28919 }
28920 llvm_unreachable("Unsupported GFNI opcode");
28921}
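// For example, getGFNICtrlImm(ISD::SHL, 1) =
//   (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F) = 0x0001020408102040.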
28922
28923// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28924SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28925 MVT VT, unsigned Amt = 0) {
28926 assert(VT.getVectorElementType() == MVT::i8 &&
28927 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28928 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28929 SmallVector<SDValue> MaskBits;
28930 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28931 uint64_t Bits = (Imm >> (I % 64)) & 255;
28932 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28933 }
28934 return DAG.getBuildVector(VT, DL, MaskBits);
28935}
28936
28937 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28938 //
28939 // i8/i16 vector implemented using dword LZCNT vector instruction
28940 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28941 // split the vector, perform the operation on its Lo and Hi parts and
28942 // concatenate the results.
28943 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28944 const X86Subtarget &Subtarget) {
28945 assert(Op.getOpcode() == ISD::CTLZ);
28946 SDLoc dl(Op);
28947 MVT VT = Op.getSimpleValueType();
28948 MVT EltVT = VT.getVectorElementType();
28949 unsigned NumElems = VT.getVectorNumElements();
28950
28951 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28952 "Unsupported element type");
28953
28954 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28955 if (NumElems > 16 ||
28956 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28957 return splitVectorIntUnary(Op, DAG, dl);
28958
28959 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28960 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28961 "Unsupported value type for operation");
28962
28963 // Use the natively supported vector instruction vplzcntd.
28964 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28965 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28966 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28967 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28968
28969 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28970}
28971
28972// Lower CTLZ using a PSHUFB lookup table implementation.
28973 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28974 const X86Subtarget &Subtarget,
28975 SelectionDAG &DAG) {
28976 MVT VT = Op.getSimpleValueType();
28977 int NumElts = VT.getVectorNumElements();
28978 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28979 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28980
28981 // Per-nibble leading zero PSHUFB lookup table.
28982 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28983 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28984 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28985 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28986
28987 SmallVector<SDValue, 64> LUTVec;
28988 for (int i = 0; i < NumBytes; ++i)
28989 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28990 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28991
28992 // Begin by bitcasting the input to byte vector, then split those bytes
28993 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28994 // If the hi input nibble is zero then we add both results together, otherwise
28995 // we just take the hi result (by masking the lo result to zero before the
28996 // add).
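// Worked example for the byte 0x1A: the hi nibble is 0x1, so LUT[1] = 3 is
// taken and the lo lookup is masked to zero, giving ctlz(0x1A) = 3.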
28997 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28998 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28999
29000 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29001 SDValue Lo = Op0;
29002 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29003 SDValue HiZ;
29004 if (CurrVT.is512BitVector()) {
29005 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29006 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29007 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29008 } else {
29009 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29010 }
29011
29012 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29013 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29014 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29015 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29016
29017 // Merge the result from vXi8 back to VT, working on the lo/hi halves
29018 // of the current vector width in the same way we did for the nibbles.
29019 // If the upper half of the input element is zero then add the halves'
29020 // leading zero counts together, otherwise just use the upper half's.
29021 // Double the width of the result until we are at target width.
29022 while (CurrVT != VT) {
29023 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29024 int CurrNumElts = CurrVT.getVectorNumElements();
29025 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29026 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29027 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29028
29029 // Check if the upper half of the input element is zero.
29030 if (CurrVT.is512BitVector()) {
29031 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29032 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29033 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29034 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29035 } else {
29036 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29037 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29038 }
29039 HiZ = DAG.getBitcast(NextVT, HiZ);
29040
29041 // Move the upper/lower halves to the lower bits as we'll be extending to
29042 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29043 // together.
29044 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29045 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29046 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29047 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29048 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29049 CurrVT = NextVT;
29050 }
29051
29052 return Res;
29053}
29054
29055 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29056 const X86Subtarget &Subtarget,
29057 SelectionDAG &DAG) {
29058 MVT VT = Op.getSimpleValueType();
29059
29060 if (Subtarget.hasCDI() &&
29061 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29062 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29063 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29064
29065 // Decompose 256-bit ops into smaller 128-bit ops.
29066 if (VT.is256BitVector() && !Subtarget.hasInt256())
29067 return splitVectorIntUnary(Op, DAG, DL);
29068
29069 // Decompose 512-bit ops into smaller 256-bit ops.
29070 if (VT.is512BitVector() && !Subtarget.hasBWI())
29071 return splitVectorIntUnary(Op, DAG, DL);
29072
29073 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29074 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29075}
29076
29077 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29078 SelectionDAG &DAG,
29079 const X86Subtarget &Subtarget) {
29080 MVT VT = Op.getSimpleValueType();
29081 SDValue Input = Op.getOperand(0);
29082
29083 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29084 "Expected vXi8 input for GFNI-based CTLZ lowering");
29085
29086 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29087
29088 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29089 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29090
29091 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29092 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29093 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29094
29095 SDValue LZCNT =
29096 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29097 DAG.getTargetConstant(8, DL, MVT::i8));
29098 return LZCNT;
29099}
29100
29101static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29102 SelectionDAG &DAG) {
29103 MVT VT = Op.getSimpleValueType();
29104 MVT OpVT = VT;
29105 unsigned NumBits = VT.getSizeInBits();
29106 SDLoc dl(Op);
29107 unsigned Opc = Op.getOpcode();
29108
29109 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29110 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29111
29112 if (VT.isVector())
29113 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29114
29115 Op = Op.getOperand(0);
29116 if (VT == MVT::i8) {
29117 // Zero extend to i32 since there is no i8 bsr.
29118 OpVT = MVT::i32;
29119 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29120 }
29121
29122 // Check if we can safely pass a result through BSR for zero sources.
29123 SDValue PassThru = DAG.getUNDEF(OpVT);
29124 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29125 !DAG.isKnownNeverZero(Op))
29126 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
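// With a zero source the pass-through value 2*NumBits-1 becomes NumBits after
// the final XOR with NumBits-1 (e.g. i32: 63 ^ 31 = 32), matching CTLZ(0).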
29127
29128 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29129 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29130 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29131
29132 // Skip CMOV if we're using a pass through value.
29133 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29134 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29135 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29136 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29137 Op.getValue(1)};
29138 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29139 }
29140
29141 // Finally xor with NumBits-1.
29142 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29143 DAG.getConstant(NumBits - 1, dl, OpVT));
29144
29145 if (VT == MVT::i8)
29146 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29147 return Op;
29148}
29149
29150static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29151 SelectionDAG &DAG) {
29152 MVT VT = Op.getSimpleValueType();
29153 unsigned NumBits = VT.getScalarSizeInBits();
29154 SDValue N0 = Op.getOperand(0);
29155 SDLoc dl(Op);
29156 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29157
29158 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29159 "Only scalar CTTZ requires custom lowering");
29160
29161 // Check if we can safely pass a result through BSF for zero sources.
29162 SDValue PassThru = DAG.getUNDEF(VT);
29163 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29164 PassThru = DAG.getConstant(NumBits, dl, VT);
29165
29166 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29167 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29168 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29169
29170 // Skip CMOV if src is never zero or we're using a pass through value.
29171 if (NonZeroSrc || !PassThru.isUndef())
29172 return Op;
29173
29174 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29175 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29176 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29177 Op.getValue(1)};
29178 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29179}
29180
29181 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29182 const X86Subtarget &Subtarget) {
29183 MVT VT = Op.getSimpleValueType();
29184 SDLoc DL(Op);
29185
29186 if (VT == MVT::i16 || VT == MVT::i32)
29187 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29188
29189 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29190 return splitVectorIntBinary(Op, DAG, DL);
29191
29192 assert(Op.getSimpleValueType().is256BitVector() &&
29193 Op.getSimpleValueType().isInteger() &&
29194 "Only handle AVX 256-bit vector integer operation");
29195 return splitVectorIntBinary(Op, DAG, DL);
29196}
29197
29198 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29199 const X86Subtarget &Subtarget) {
29200 MVT VT = Op.getSimpleValueType();
29201 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29202 unsigned Opcode = Op.getOpcode();
29203 SDLoc DL(Op);
29204
29205 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29206 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29207 assert(Op.getSimpleValueType().isInteger() &&
29208 "Only handle AVX vector integer operation");
29209 return splitVectorIntBinary(Op, DAG, DL);
29210 }
29211
29212 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29214 EVT SetCCResultType =
29215 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29216
29217 unsigned BitWidth = VT.getScalarSizeInBits();
29218 if (Opcode == ISD::USUBSAT) {
29219 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29220 // Handle a special-case with a bit-hack instead of cmp+select:
29221 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29222 // If the target can use VPTERNLOG, DAGToDAG will match this as
29223 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29224 // "broadcast" constant load.
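// e.g. for i8 and X = 0x90: (0x90 ^ 0x80) & (0x90 s>> 7) = 0x10 & 0xFF = 0x10
// = usubsat(0x90, 0x80); for X = 0x30 the arithmetic shift is 0, so the
// result is 0 as required.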
29225 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29226 if (C && C->getAPIntValue().isSignMask()) {
29227 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29228 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29229 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29230 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29231 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29232 }
29233 }
29234 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29235 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29236 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29237 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29238 // TODO: Move this to DAGCombiner?
29239 if (SetCCResultType == VT &&
29240 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29241 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29242 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29243 }
29244 }
29245
29246 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29247 (!VT.isVector() || VT == MVT::v2i64)) {
29248 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29249 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29250 SDValue Zero = DAG.getConstant(0, DL, VT);
29251 SDValue Result =
29252 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29253 DAG.getVTList(VT, SetCCResultType), X, Y);
29254 SDValue SumDiff = Result.getValue(0);
29255 SDValue Overflow = Result.getValue(1);
29256 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29257 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29258 SDValue SumNeg =
29259 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29260 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29261 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29262 }
29263
29264 // Use default expansion.
29265 return SDValue();
29266}
29267
29268static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29269 SelectionDAG &DAG) {
29270 MVT VT = Op.getSimpleValueType();
29271 SDLoc DL(Op);
29272
29273 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29274 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29275 // 8-bit integer abs to NEG and CMOV.
29276 SDValue N0 = Op.getOperand(0);
29277 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29278 DAG.getConstant(0, DL, VT), N0);
29279 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29280 SDValue(Neg.getNode(), 1)};
29281 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29282 }
29283
29284 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29285 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29286 SDValue Src = Op.getOperand(0);
29287 SDValue Neg = DAG.getNegative(Src, DL, VT);
29288 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29289 }
29290
29291 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29292 assert(VT.isInteger() &&
29293 "Only handle AVX 256-bit vector integer operation");
29294 return splitVectorIntUnary(Op, DAG, DL);
29295 }
29296
29297 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29298 return splitVectorIntUnary(Op, DAG, DL);
29299
29300 // Default to expand.
29301 return SDValue();
29302}
29303
29304static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29305 SelectionDAG &DAG) {
29306 MVT VT = Op.getSimpleValueType();
29307 SDLoc DL(Op);
29308
29309 // For AVX1 cases, split to use legal ops.
29310 if (VT.is256BitVector() && !Subtarget.hasInt256())
29311 return splitVectorIntBinary(Op, DAG, DL);
29312
29313 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29314 return splitVectorIntBinary(Op, DAG, DL);
29315
29316 // Default to expand.
29317 return SDValue();
29318}
29319
29320static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29321 SelectionDAG &DAG) {
29322 MVT VT = Op.getSimpleValueType();
29323 SDLoc DL(Op);
29324
29325 // For AVX1 cases, split to use legal ops.
29326 if (VT.is256BitVector() && !Subtarget.hasInt256())
29327 return splitVectorIntBinary(Op, DAG, DL);
29328
29329 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29330 return splitVectorIntBinary(Op, DAG, DL);
29331
29332 // Default to expand.
29333 return SDValue();
29334}
29335
29336 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29337 SelectionDAG &DAG) {
29338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29339 EVT VT = Op.getValueType();
29340 SDValue X = Op.getOperand(0);
29341 SDValue Y = Op.getOperand(1);
29342 SDLoc DL(Op);
29343 bool IsMaxOp =
29344 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29345 bool IsNum =
29346 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29347 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29348 unsigned Opc = 0;
29349 if (VT.isVector())
29351 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29353
29354 if (Opc) {
29355 SDValue Imm =
29356 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29357 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29358 }
29359 }
29360
29361 uint64_t SizeInBits = VT.getScalarSizeInBits();
29362 APInt PreferredZero = APInt::getZero(SizeInBits);
29363 APInt OppositeZero = PreferredZero;
29364 EVT IVT = VT.changeTypeToInteger();
29365 X86ISD::NodeType MinMaxOp;
29366 if (IsMaxOp) {
29367 MinMaxOp = X86ISD::FMAX;
29368 OppositeZero.setSignBit();
29369 } else {
29370 PreferredZero.setSignBit();
29371 MinMaxOp = X86ISD::FMIN;
29372 }
29373 EVT SetCCType =
29374 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29375
29376 // The tables below show the expected result of Max in cases of NaN and
29377 // signed zeros.
29378 //
29379 // Y Y
29380 // Num xNaN +0 -0
29381 // --------------- ---------------
29382 // Num | Max | Y | +0 | +0 | +0 |
29383 // X --------------- X ---------------
29384 // xNaN | X | X/Y | -0 | +0 | -0 |
29385 // --------------- ---------------
29386 //
29387 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29388 // reordering.
29389 //
29390 // We check if any of operands is NaN and return NaN. Then we check if any of
29391 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29392 // to ensure the correct zero is returned.
29393 auto MatchesZero = [](SDValue Op, APInt Zero) {
29394 Op = peekThroughBitcasts(Op);
29395 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29396 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29397 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29398 return CstOp->getAPIntValue() == Zero;
29399 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29400 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29401 for (const SDValue &OpVal : Op->op_values()) {
29402 if (OpVal.isUndef())
29403 continue;
29404 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29405 if (!CstOp)
29406 return false;
29407 if (!CstOp->getValueAPF().isZero())
29408 continue;
29409 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29410 return false;
29411 }
29412 return true;
29413 }
29414 return false;
29415 };
29416
29417 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29418 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29419 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29420 Op->getFlags().hasNoSignedZeros() ||
29421 DAG.isKnownNeverZeroFloat(X) ||
29422 DAG.isKnownNeverZeroFloat(Y);
29423 SDValue NewX, NewY;
29424 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29425 MatchesZero(X, OppositeZero)) {
29426 // Operands are already in the right order, or the order does not matter.
29427 NewX = X;
29428 NewY = Y;
29429 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29430 NewX = Y;
29431 NewY = X;
29432 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29433 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29434 if (IsXNeverNaN)
29435 std::swap(X, Y);
29436 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29437 // to an xmm register.
29438 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29439 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29440 // Bits of classes:
29441 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29442 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29443 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29444 DL, MVT::i32);
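// Per the table above, 0b11 tests QNaN | PosZero (for FMAX) and 0b101 tests
// QNaN | NegZero (for FMIN).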
29445 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29446 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29447 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29448 DAG.getVectorIdxConstant(0, DL));
29449 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29450 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29451 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29452 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29453 } else {
29454 SDValue IsXSigned;
29455 if (Subtarget.is64Bit() || VT != MVT::f64) {
29456 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29457 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29458 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29459 } else {
29460 assert(VT == MVT::f64);
29461 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29462 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29463 DAG.getVectorIdxConstant(0, DL));
29464 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29465 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29466 DAG.getVectorIdxConstant(1, DL));
29467 Hi = DAG.getBitcast(MVT::i32, Hi);
29468 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29469 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29470 *DAG.getContext(), MVT::i32);
29471 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29472 }
29473 if (MinMaxOp == X86ISD::FMAX) {
29474 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29475 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29476 } else {
29477 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29478 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29479 }
29480 }
29481
29482 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29483 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29484
29485 // If we did not have to order the operands for signed-zero handling, but we
29486 // still need to handle NaN and we know that one of the operands is not NaN, then:
29487 // - For minimum/maximum, put it in the first operand,
29488 // - For minimumnum/maximumnum, put it in the second operand,
29489 // and we will not need to post handle NaN after max/min.
29490 if (IgnoreSignedZero && !IgnoreNaN &&
29491 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29492 std::swap(NewX, NewY);
29493
29494 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29495
29496 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29497 return MinMax;
29498
29499 if (DAG.isKnownNeverNaN(NewX))
29500 NewX = NewY;
29501
29502 SDValue IsNaN =
29503 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29504
29505 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29506}
29507
29508static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29509 SelectionDAG &DAG) {
29510 MVT VT = Op.getSimpleValueType();
29511 SDLoc dl(Op);
29512
29513 // For AVX1 cases, split to use legal ops.
29514 if (VT.is256BitVector() && !Subtarget.hasInt256())
29515 return splitVectorIntBinary(Op, DAG, dl);
29516
29517 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29518 return splitVectorIntBinary(Op, DAG, dl);
29519
29520 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29521 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29522
29523 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29524 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29525 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29526
29527 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29528 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29529 if (VT.bitsGE(MVT::i32)) {
29530 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29531 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29532 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29533 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29534 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29535 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29536 DAG.getTargetConstant(CC, dl, MVT::i8),
29537 Diff1.getValue(1));
29538 }
29539
29540 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29541 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29542 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29543 MVT WideVT = MVT::getIntegerVT(WideBits);
29544 if (TLI.isTypeLegal(WideVT)) {
29545 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29546 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29547 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29548 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29549 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29550 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29551 DAG.getTargetConstant(CC, dl, MVT::i8),
29552 Diff1.getValue(1));
29553 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29554 }
29555 }
29556
29557 // Default to expand.
29558 return SDValue();
29559}
29560
29561static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29562 SelectionDAG &DAG) {
29563 SDLoc dl(Op);
29564 MVT VT = Op.getSimpleValueType();
29565
29566 // Decompose 256-bit ops into 128-bit ops.
29567 if (VT.is256BitVector() && !Subtarget.hasInt256())
29568 return splitVectorIntBinary(Op, DAG, dl);
29569
29570 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29571 return splitVectorIntBinary(Op, DAG, dl);
29572
29573 SDValue A = Op.getOperand(0);
29574 SDValue B = Op.getOperand(1);
29575
29576 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29577 // vector pairs, multiply and truncate.
29578 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29579 unsigned NumElts = VT.getVectorNumElements();
29580 unsigned NumLanes = VT.getSizeInBits() / 128;
29581 unsigned NumEltsPerLane = NumElts / NumLanes;
29582
29583 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29584 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29585 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29586 return DAG.getNode(
29587 ISD::TRUNCATE, dl, VT,
29588 DAG.getNode(ISD::MUL, dl, ExVT,
29589 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29590 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29591 }
29592
29593 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29594
29595 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29596 // Don't do this if we only need to unpack one half.
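// PMADDUBSW multiplies unsigned bytes of A by signed bytes of B and adds
// adjacent pairs; with every other byte of B zeroed, each pair reduces to a
// single per-byte product, so the saturating add cannot overflow.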
29597 if (Subtarget.hasSSSE3()) {
29598 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29599 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29600 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29601 if (BIsBuildVector) {
29602 for (auto [Idx, Val] : enumerate(B->ops())) {
29603 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29604 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29605 else
29606 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29607 }
29608 }
29609 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29610 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29611 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29612 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29613 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29614 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29615 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29616 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29617 DAG.getTargetConstant(8, dl, MVT::i8));
29618 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29619 }
29620 }
29621
29622 // Extract the lo/hi parts and any-extend them to i16.
29623 // We're going to mask off the low byte of each result element of the
29624 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29625 // element.
29626 SDValue Undef = DAG.getUNDEF(VT);
29627 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29628 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29629
29630 SDValue BLo, BHi;
29631 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29632 // If the RHS is a constant, manually unpackl/unpackh.
29633 SmallVector<SDValue, 16> LoOps, HiOps;
29634 for (unsigned i = 0; i != NumElts; i += 16) {
29635 for (unsigned j = 0; j != 8; ++j) {
29636 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29637 MVT::i16));
29638 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29639 MVT::i16));
29640 }
29641 }
29642
29643 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29644 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29645 } else {
29646 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29647 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29648 }
29649
29650 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29651 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29652 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29653 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29654 }
29655
29656 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29657 if (VT == MVT::v4i32) {
29658 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29659 "Should not custom lower when pmulld is available!");
29660
29661 // Extract the odd parts.
29662 static const int UnpackMask[] = {1, 1, 3, 3};
29663 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29664 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29665
29666 // Multiply the even parts.
29667 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29668 DAG.getBitcast(MVT::v2i64, A),
29669 DAG.getBitcast(MVT::v2i64, B));
29670 // Now multiply odd parts.
29671 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29672 DAG.getBitcast(MVT::v2i64, Aodds),
29673 DAG.getBitcast(MVT::v2i64, Bodds));
29674
29675 Evens = DAG.getBitcast(VT, Evens);
29676 Odds = DAG.getBitcast(VT, Odds);
29677
29678 // Merge the two vectors back together with a shuffle. This expands into 2
29679 // shuffles.
29680 static const int ShufMask[] = { 0, 4, 2, 6 };
29681 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29682 }
29683
29684 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29685 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29686 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29687
29688 // Ahi = psrlqi(a, 32);
29689 // Bhi = psrlqi(b, 32);
29690 //
29691 // AloBlo = pmuludq(a, b);
29692 // AloBhi = pmuludq(a, Bhi);
29693 // AhiBlo = pmuludq(Ahi, b);
29694 //
29695 // Hi = psllqi(AloBhi + AhiBlo, 32);
29696 // return AloBlo + Hi;
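// This follows from a*b = (Alo + 2^32*Ahi) * (Blo + 2^32*Bhi)
//                       = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
// where the last term vanishes modulo 2^64.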
29697 KnownBits AKnown = DAG.computeKnownBits(A);
29698 KnownBits BKnown = DAG.computeKnownBits(B);
29699
29700 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29701 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29702 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29703
29704 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29705 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29706 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29707
29708 SDValue Zero = DAG.getConstant(0, dl, VT);
29709
29710 // Only multiply lo/hi halves that aren't known to be zero.
29711 SDValue AloBlo = Zero;
29712 if (!ALoIsZero && !BLoIsZero)
29713 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29714
29715 SDValue AloBhi = Zero;
29716 if (!ALoIsZero && !BHiIsZero) {
29717 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29718 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29719 }
29720
29721 SDValue AhiBlo = Zero;
29722 if (!AHiIsZero && !BLoIsZero) {
29723 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29724 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29725 }
29726
29727 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29728 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29729
29730 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29731}
29732
29733 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29734 MVT VT, bool IsSigned,
29735 const X86Subtarget &Subtarget,
29736 SelectionDAG &DAG,
29737 SDValue *Low = nullptr) {
29738 unsigned NumElts = VT.getVectorNumElements();
29739
29740 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29741 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29742 // lane results back together.
29743
29744 // We'll take different approaches for signed and unsigned.
29745 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29746 // and use pmullw to calculate the full 16-bit product.
29747 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29748 // shift them left into the upper byte of each word. This allows us to use
29749 // pmulhw to calculate the full 16-bit product. This trick means we don't
29750 // need to sign extend the bytes to use pmullw.
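// e.g. with byte values a and b placed in the upper byte of each word,
// pmulhw computes ((a << 8) * (b << 8)) >> 16 = a * b, the full signed
// 16-bit product.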
29751
29752 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29753 SDValue Zero = DAG.getConstant(0, dl, VT);
29754
29755 SDValue ALo, AHi;
29756 if (IsSigned) {
29757 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29758 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29759 } else {
29760 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29761 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29762 }
29763
29764 SDValue BLo, BHi;
29765 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29766 // If the RHS is a constant, manually unpackl/unpackh and extend.
29767 SmallVector<SDValue, 16> LoOps, HiOps;
29768 for (unsigned i = 0; i != NumElts; i += 16) {
29769 for (unsigned j = 0; j != 8; ++j) {
29770 SDValue LoOp = B.getOperand(i + j);
29771 SDValue HiOp = B.getOperand(i + j + 8);
29772
29773 if (IsSigned) {
29774 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29775 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29776 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29777 DAG.getConstant(8, dl, MVT::i16));
29778 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29779 DAG.getConstant(8, dl, MVT::i16));
29780 } else {
29781 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29782 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29783 }
29784
29785 LoOps.push_back(LoOp);
29786 HiOps.push_back(HiOp);
29787 }
29788 }
29789
29790 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29791 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29792 } else if (IsSigned) {
29793 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29794 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29795 } else {
29796 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29797 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29798 }
29799
29800 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29801 // pack back to vXi8.
29802 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29803 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29804 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29805
29806 if (Low)
29807 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29808
29809 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29810}
29811
29812static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29813 SelectionDAG &DAG) {
29814 SDLoc dl(Op);
29815 MVT VT = Op.getSimpleValueType();
29816 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29817 unsigned NumElts = VT.getVectorNumElements();
29818 SDValue A = Op.getOperand(0);
29819 SDValue B = Op.getOperand(1);
29820
29821 // Decompose 256-bit ops into 128-bit ops.
29822 if (VT.is256BitVector() && !Subtarget.hasInt256())
29823 return splitVectorIntBinary(Op, DAG, dl);
29824
29825 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29826 return splitVectorIntBinary(Op, DAG, dl);
29827
29828 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29829 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29830 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29831 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29832
29833 // PMULxD operations multiply each even value (starting at 0) of LHS with
29834 // the related value of RHS and produce a widened result.
29835 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29836 // => <2 x i64> <ae|cg>
29837 //
29838 // In other words, to have all the results, we need to perform two PMULxD:
29839 // 1. one with the even values.
29840 // 2. one with the odd values.
29841 // To achieve #2, we need to place the odd values at an even position.
29842 //
29843 // Place the odd value at an even position (basically, shift all values 1
29844 // step to the left):
29845 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29846 9, -1, 11, -1, 13, -1, 15, -1};
29847 // <a|b|c|d> => <b|undef|d|undef>
29848 SDValue Odd0 =
29849 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29850 // <e|f|g|h> => <f|undef|h|undef>
29851 SDValue Odd1 =
29852 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29853
29854 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29855 // ints.
29856 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29857 unsigned Opcode =
29858 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29859 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29860 // => <2 x i64> <ae|cg>
29861 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29862 DAG.getBitcast(MulVT, A),
29863 DAG.getBitcast(MulVT, B)));
29864 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29865 // => <2 x i64> <bf|dh>
29866 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29867 DAG.getBitcast(MulVT, Odd0),
29868 DAG.getBitcast(MulVT, Odd1)));
29869
29870 // Shuffle it back into the right order.
29871 SmallVector<int, 16> ShufMask(NumElts);
29872 for (int i = 0; i != (int)NumElts; ++i)
29873 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
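// e.g. for v4i32 this produces the mask {1, 5, 3, 7}, selecting the high
// 32 bits of a*e, b*f, c*g and d*h in the original element order.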
29874
29875 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29876
29877 // If we have a signed multiply but no PMULDQ fix up the result of an
29878 // unsigned multiply.
29879 if (IsSigned && !Subtarget.hasSSE41()) {
29880 SDValue Zero = DAG.getConstant(0, dl, VT);
29881 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29882 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29883 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29884 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29885
29886 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29887 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29888 }
29889
29890 return Res;
29891 }
29892
29893 // Only i8 vectors should need custom lowering after this.
29894 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29895 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29896 "Unsupported vector type");
29897
29898 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29899 // logical shift down the upper half and pack back to i8.
29900
29901 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29902 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29903
29904 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29905 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29906 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29907 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29908 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29909 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29910 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29911 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29912 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29913 }
29914
29915 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29916}
29917
29918// Custom lowering for SMULO/UMULO.
29919static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29920 SelectionDAG &DAG) {
29921 MVT VT = Op.getSimpleValueType();
29922
29923 // Scalars defer to LowerXALUO.
29924 if (!VT.isVector())
29925 return LowerXALUO(Op, DAG);
29926
29927 SDLoc dl(Op);
29928 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29929 SDValue A = Op.getOperand(0);
29930 SDValue B = Op.getOperand(1);
29931 EVT OvfVT = Op->getValueType(1);
29932
29933 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29934 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29935 // Extract the LHS Lo/Hi vectors
29936 SDValue LHSLo, LHSHi;
29937 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29938
29939 // Extract the RHS Lo/Hi vectors
29940 SDValue RHSLo, RHSHi;
29941 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29942
29943 EVT LoOvfVT, HiOvfVT;
29944 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29945 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29946 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29947
29948 // Issue the split operations.
29949 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29950 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29951
29952 // Join the separate data results and the overflow results.
29953 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29954 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29955 Hi.getValue(1));
29956
29957 return DAG.getMergeValues({Res, Ovf}, dl);
29958 }
29959
29960 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29961 EVT SetccVT =
29962 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29963
29964 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29965 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29966 unsigned NumElts = VT.getVectorNumElements();
29967 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29968 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29969 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29970 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29971 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29972
29973 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29974
29975 SDValue Ovf;
29976 if (IsSigned) {
29977 SDValue High, LowSign;
29978 if (OvfVT.getVectorElementType() == MVT::i1 &&
29979 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29980 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29981 // Shift the high down filling with sign bits.
29982 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29983 // Fill all 16 bits with the sign bit from the low.
29984 LowSign =
29985 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29986 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29987 15, DAG);
29988 SetccVT = OvfVT;
29989 if (!Subtarget.hasBWI()) {
29990 // We can't do a vXi16 compare so sign extend to v16i32.
29991 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29992 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29993 }
29994 } else {
29995 // Otherwise do the compare at vXi8.
29996 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29997 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29998 LowSign =
29999 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30000 }
30001
30002 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30003 } else {
30004 SDValue High =
30005 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30006 if (OvfVT.getVectorElementType() == MVT::i1 &&
30007 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30008 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30009 SetccVT = OvfVT;
30010 if (!Subtarget.hasBWI()) {
30011 // We can't do a vXi16 compare so sign extend to v16i32.
30012 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30013 }
30014 } else {
30015 // Otherwise do the compare at vXi8.
30016 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30017 }
30018
30019 Ovf =
30020 DAG.getSetCC(dl, SetccVT, High,
30021 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30022 }
30023
30024 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30025
30026 return DAG.getMergeValues({Low, Ovf}, dl);
30027 }
30028
30029 SDValue Low;
30030 SDValue High =
30031 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30032
30033 SDValue Ovf;
30034 if (IsSigned) {
30035 // SMULO overflows if the high bits don't match the sign of the low.
30036 SDValue LowSign =
30037 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30038 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30039 } else {
30040 // UMULO overflows if the high bits are non-zero.
30041 Ovf =
30042 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30043 }
30044
30045 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30046
30047 return DAG.getMergeValues({Low, Ovf}, dl);
30048}
30049
30050SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30051 assert(Subtarget.isTargetWin64() && "Unexpected target");
30052 EVT VT = Op.getValueType();
30053 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30054 "Unexpected return type for lowering");
30055
30056 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30057 SmallVector<SDValue> Result;
30058 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30059 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30060 }
30061
30062 RTLIB::Libcall LC;
30063 bool isSigned;
30064 switch (Op->getOpcode()) {
30065 // clang-format off
30066 default: llvm_unreachable("Unexpected request for libcall!");
30067 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30068 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30069 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30070 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30071 // clang-format on
30072 }
30073
30074 SDLoc dl(Op);
30075 SDValue InChain = DAG.getEntryNode();
30076
30077 TargetLowering::ArgListTy Args;
30078 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30079 EVT ArgVT = Op->getOperand(i).getValueType();
30080 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30081 "Unexpected argument type for lowering");
30082 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30083 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30084 MachinePointerInfo MPI =
30085 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30086 InChain =
30087 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30088 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30089 }
30090
30091 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30092 getPointerTy(DAG.getDataLayout()));
30093
30094 TargetLowering::CallLoweringInfo CLI(DAG);
30095 CLI.setDebugLoc(dl)
30096 .setChain(InChain)
30097 .setLibCallee(
30098 getLibcallCallingConv(LC),
30099 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30100 std::move(Args))
30101 .setInRegister()
30102 .setSExtResult(isSigned)
30103 .setZExtResult(!isSigned);
30104
30105 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30106 return DAG.getBitcast(VT, CallInfo.first);
30107}
30108
30109SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30110 SelectionDAG &DAG,
30111 SDValue &Chain) const {
30112 assert(Subtarget.isTargetWin64() && "Unexpected target");
30113 EVT VT = Op.getValueType();
30114 bool IsStrict = Op->isStrictFPOpcode();
30115
30116 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30117 EVT ArgVT = Arg.getValueType();
30118
30119 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30120 "Unexpected return type for lowering");
30121
30122 RTLIB::Libcall LC;
30123 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30124 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30125 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30126 else
30127 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30128 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30129
30130 SDLoc dl(Op);
30131 MakeLibCallOptions CallOptions;
30132 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30133
30135 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30136 // expected VT (i128).
30137 std::tie(Result, Chain) =
30138 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30139 Result = DAG.getBitcast(VT, Result);
30140 return Result;
30141}
30142
30143SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30144 SelectionDAG &DAG) const {
30145 assert(Subtarget.isTargetWin64() && "Unexpected target");
30146 EVT VT = Op.getValueType();
30147 bool IsStrict = Op->isStrictFPOpcode();
30148
30149 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30150 EVT ArgVT = Arg.getValueType();
30151
30152 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30153 "Unexpected argument type for lowering");
30154
30155 RTLIB::Libcall LC;
30156 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30157 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30158 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30159 else
30160 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30161 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30162
30163 SDLoc dl(Op);
30164 MakeLibCallOptions CallOptions;
30165 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30166
30167 // Pass the i128 argument as an indirect argument on the stack.
30168 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30169 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30170 MachinePointerInfo MPI =
30171 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30172 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30173
30174 SDValue Result;
30175 std::tie(Result, Chain) =
30176 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30177 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30178}
30179
30180// Return true if the required (according to Opcode) shift-imm form is natively
30181// supported by the Subtarget
30182static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30183 unsigned Opcode) {
30184 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30185 "Unexpected shift opcode");
30186
30187 if (!VT.isSimple())
30188 return false;
30189
30190 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30191 return false;
30192
30193 if (VT.getScalarSizeInBits() < 16)
30194 return false;
30195
30196 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30197 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30198 return true;
30199
30200 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30201 (VT.is256BitVector() && Subtarget.hasInt256());
30202
30203 bool AShift = LShift && (Subtarget.hasAVX512() ||
30204 (VT != MVT::v2i64 && VT != MVT::v4i64));
30205 return (Opcode == ISD::SRA) ? AShift : LShift;
30206}
30207
30208// The shift amount is a variable, but it is the same for all vector lanes.
30209// These instructions are defined together with shift-immediate.
30210static
30211bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30212 unsigned Opcode) {
30213 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30214}
30215
30216// Return true if the required (according to Opcode) variable-shift form is
30217// natively supported by the Subtarget
30218static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30219 unsigned Opcode) {
30220 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30221 "Unexpected shift opcode");
30222
30223 if (!VT.isSimple())
30224 return false;
30225
30226 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30227 return false;
30228
30229 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30230 return false;
30231
30232 // vXi16 supported only on AVX-512, BWI
30233 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30234 return false;
30235
30236 if (Subtarget.hasAVX512() &&
30237 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30238 return true;
30239
30240 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30241 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30242 return (Opcode == ISD::SRA) ? AShift : LShift;
30243}
30244
30245static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30246 const X86Subtarget &Subtarget) {
30247 MVT VT = Op.getSimpleValueType();
30248 SDLoc dl(Op);
30249 SDValue R = Op.getOperand(0);
30250 SDValue Amt = Op.getOperand(1);
30251 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30252 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30253
30254 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30255 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30256 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30257 SDValue Ex = DAG.getBitcast(ExVT, R);
30258
30259 // ashr(R, 63) === cmp_slt(R, 0)
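    // (each i64 lane becomes all-ones when negative and all-zeros otherwise,
    // which is exactly what PCMPGT(0, R) produces)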
30260 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30261 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30262 "Unsupported PCMPGT op");
30263 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30264 }
30265
30266 if (ShiftAmt >= 32) {
30267 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30268 SDValue Upper =
30269 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30270 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30271 ShiftAmt - 32, DAG);
30272 if (VT == MVT::v2i64)
30273 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30274 if (VT == MVT::v4i64)
30275 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30276 {9, 1, 11, 3, 13, 5, 15, 7});
30277 } else {
30278 // SRA upper i32, SRL whole i64 and select lower i32.
30279 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30280 ShiftAmt, DAG);
30281 SDValue Lower =
30282 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30283 Lower = DAG.getBitcast(ExVT, Lower);
30284 if (VT == MVT::v2i64)
30285 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30286 if (VT == MVT::v4i64)
30287 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30288 {8, 1, 10, 3, 12, 5, 14, 7});
30289 }
30290 return DAG.getBitcast(VT, Ex);
30291 };
30292
30293 // Optimize shl/srl/sra with constant shift amount.
30294 APInt APIntShiftAmt;
30295 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30296 return SDValue();
30297
30298 // If the shift amount is out of range, return undef.
30299 if (APIntShiftAmt.uge(EltSizeInBits))
30300 return DAG.getUNDEF(VT);
30301
30302 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30303
30304 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30305 // Hardware support for vector shifts is sparse, which makes us scalarize the
30306 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30307 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30308 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30309 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30310 // must be 0). (add undef, undef) however can be any value. To make this
30311 // safe, we must freeze R to ensure that register allocation uses the same
30312 // register for an undefined value. This ensures that the result will
30313 // still be even and preserves the original semantics.
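      // e.g. if R were left as undef, the two ADD operands could be assigned
      // different values (say 1 and 2), making the sum odd; freezing R pins
      // both uses to a single value.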
30314 R = DAG.getFreeze(R);
30315 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30316 }
30317
30318 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30319 }
30320
30321 // i64 SRA needs to be performed as partial shifts.
30322 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30323 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30324 Op.getOpcode() == ISD::SRA)
30325 return ArithmeticShiftRight64(ShiftAmt);
30326
30327 // If we're logical shifting an all-signbits value then we can just perform it
30328 // as a mask.
30329 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30330 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30331 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30332 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30333 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30334 }
30335
30336 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30337 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30338 unsigned NumElts = VT.getVectorNumElements();
30339 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30340
30341 // Simple i8 add case
30342 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30343 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30344 // must be 0). (add undef, undef) however can be any value. To make this
30345 // safe, we must freeze R to ensure that register allocation uses the same
30346 // register for an undefined value. This ensures that the result will
30347 // still be even and preserves the original semantics.
30348 R = DAG.getFreeze(R);
30349 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30350 }
30351
30352 // ashr(R, 7) === cmp_slt(R, 0)
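    // (for vXi8 this maps 0x80..0xFF to 0xFF and 0x00..0x7F to 0x00)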
30353 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30354 SDValue Zeros = DAG.getConstant(0, dl, VT);
30355 if (VT.is512BitVector()) {
30356 assert(VT == MVT::v64i8 && "Unexpected element type!");
30357 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30358 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30359 }
30360 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30361 }
30362
30363 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30364 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30365 return SDValue();
30366
30367 if (Subtarget.hasGFNI()) {
30368 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30369 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30370 DAG.getTargetConstant(0, dl, MVT::i8));
30371 }
30372
30373 if (Op.getOpcode() == ISD::SHL) {
30374 // Make a large shift.
30375 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30376 ShiftAmt, DAG);
30377 SHL = DAG.getBitcast(VT, SHL);
30378 // Zero out the rightmost bits.
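      // e.g. ShiftAmt == 3 keeps the top 5 bits of each byte, so the mask is
      // 0xF8 in every lane.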
30379 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30380 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30381 }
30382 if (Op.getOpcode() == ISD::SRL) {
30383 // Make a large shift.
30384 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30385 ShiftAmt, DAG);
30386 SRL = DAG.getBitcast(VT, SRL);
30387 // Zero out the leftmost bits.
30388 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30389 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30390 }
30391 if (Op.getOpcode() == ISD::SRA) {
30392 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
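      // e.g. ShiftAmt == 3: Mask = 0x10; for R = 0x80 the lshr gives 0x10, the
      // xor gives 0x00 and the sub gives 0xF0, which is ashr(0x80, 3).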
30393 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30394
30395 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30396 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30397 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30398 return Res;
30399 }
30400 llvm_unreachable("Unknown shift opcode.");
30401 }
30402
30403 return SDValue();
30404}
30405
30406static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30407 const X86Subtarget &Subtarget) {
30408 MVT VT = Op.getSimpleValueType();
30409 SDLoc dl(Op);
30410 SDValue R = Op.getOperand(0);
30411 SDValue Amt = Op.getOperand(1);
30412 unsigned Opcode = Op.getOpcode();
30413 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30414
30415 int BaseShAmtIdx = -1;
30416 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30417 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30418 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30419 Subtarget, DAG);
30420
30421 // vXi8 shifts - shift as v8i16 + mask result.
30422 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30423 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30424 VT == MVT::v64i8) &&
30425 !Subtarget.hasXOP()) {
30426 unsigned NumElts = VT.getVectorNumElements();
30427 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30428 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30429 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30430 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30431
30432 // Create the mask using vXi16 shifts. For shift-rights we need to move
30433 // the upper byte down before splatting the vXi8 mask.
30434 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30435 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30436 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30437 if (Opcode != ISD::SHL)
30438 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30439 8, DAG);
30440 BitMask = DAG.getBitcast(VT, BitMask);
30441 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30442 SmallVector<int, 64>(NumElts, 0));
30443
30444 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30445 DAG.getBitcast(ExtVT, R), BaseShAmt,
30446 BaseShAmtIdx, Subtarget, DAG);
30447 Res = DAG.getBitcast(VT, Res);
30448 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30449
30450 if (Opcode == ISD::SRA) {
30451 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30452 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30453 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30454 SignMask =
30455 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30456 BaseShAmtIdx, Subtarget, DAG);
30457 SignMask = DAG.getBitcast(VT, SignMask);
30458 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30459 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30460 }
30461 return Res;
30462 }
30463 }
30464 }
30465
30466 return SDValue();
30467}
30468
30469// Convert a shift/rotate left amount to a multiplication scale factor.
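// i.e. shl(X, C) == mul(X, 1 << C), so an amount vector of <0, 1, 2, 3>
// becomes the scale vector <1, 2, 4, 8>.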
30470static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30471 const X86Subtarget &Subtarget,
30472 SelectionDAG &DAG) {
30473 MVT VT = Amt.getSimpleValueType();
30474 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30475 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30476 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30477 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30478 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30479 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30480 return SDValue();
30481
30482 MVT SVT = VT.getVectorElementType();
30483 unsigned SVTBits = SVT.getSizeInBits();
30484 unsigned NumElems = VT.getVectorNumElements();
30485
30486 APInt UndefElts;
30487 SmallVector<APInt> EltBits;
30488 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30489 APInt One(SVTBits, 1);
30490 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30491 for (unsigned I = 0; I != NumElems; ++I) {
30492 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30493 continue;
30494 uint64_t ShAmt = EltBits[I].getZExtValue();
30495 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30496 }
30497 return DAG.getBuildVector(VT, dl, Elts);
30498 }
30499
30500 // If the target doesn't support variable shifts, use either FP conversion
30501 // or integer multiplication to avoid shifting each element individually.
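  // For v4i32 this builds floats whose biased exponent field is (Amt + 127),
  // i.e. the value 2^Amt, and converts them back to integers to get the scales.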
30502 if (VT == MVT::v4i32) {
30503 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30504 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30505 DAG.getConstant(0x3f800000U, dl, VT));
30506 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30507 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30508 }
30509
30510 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30511 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30512 SDValue Z = DAG.getConstant(0, dl, VT);
30513 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30514 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30515 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30516 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30517 if (Subtarget.hasSSE41())
30518 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30519 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30520 }
30521
30522 return SDValue();
30523}
30524
30525static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30526 SelectionDAG &DAG) {
30527 MVT VT = Op.getSimpleValueType();
30528 SDLoc dl(Op);
30529 SDValue R = Op.getOperand(0);
30530 SDValue Amt = Op.getOperand(1);
30531 unsigned NumElts = VT.getVectorNumElements();
30532 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30533 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30534
30535 unsigned Opc = Op.getOpcode();
30536 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30537 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30538
30539 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30540 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30541
30542 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30543 return V;
30544
30545 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30546 return V;
30547
30548 if (supportedVectorVarShift(VT, Subtarget, Opc))
30549 return Op;
30550
30551 // i64 vector arithmetic shift can be emulated with the transform:
30552 // M = lshr(SIGN_MASK, Amt)
30553 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30554 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30555 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30556 Opc == ISD::SRA) {
30557 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30558 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30559 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30560 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30561 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30562 return R;
30563 }
30564
30565 // XOP has 128-bit variable logical/arithmetic shifts.
30566 // +ve/-ve Amt = shift left/right.
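  // (e.g. a VPSHA lane amount of -3 performs an arithmetic shift right by 3 in
  // that lane)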
30567 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30568 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30569 if (Opc == ISD::SRL || Opc == ISD::SRA)
30570 Amt = DAG.getNegative(Amt, dl, VT);
30571 if (Opc == ISD::SHL || Opc == ISD::SRL)
30572 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30573 if (Opc == ISD::SRA)
30574 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30575 }
30576
30577 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30578 // shifts per-lane and then shuffle the partial results back together.
30579 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30580 // Splat the shift amounts so the scalar shifts above will catch it.
30581 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30582 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30583 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30584 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30585 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30586 }
30587
30588 // Build a map of in-range constant amounts with an element mask of where they occur.
30589 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30590 if (ConstantAmt) {
30591 for (unsigned I = 0; I != NumElts; ++I) {
30592 SDValue A = Amt.getOperand(I);
30593 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30594 continue;
30595 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30596 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30597 if (!Inserted) {
30598 It->second.setBit(I);
30599 continue;
30600 }
30601 It->second = APInt::getOneBitSet(NumElts, I);
30602 }
30603 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30604 }
30605
30606 // If possible, lower this shift as a sequence of two shifts by
30607 // constant plus a BLENDing shuffle instead of scalarizing it.
30608 // Example:
30609 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30610 //
30611 // Could be rewritten as:
30612 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30613 //
30614 // The advantage is that the two shifts from the example would be
30615 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30616 if (UniqueCstAmt.size() == 2 &&
30617 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30618 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30619 unsigned AmtA = UniqueCstAmt.begin()->first;
30620 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30621 const APInt &MaskA = UniqueCstAmt.begin()->second;
30622 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30623 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30624 for (unsigned I = 0; I != NumElts; ++I) {
30625 if (MaskA[I])
30626 ShuffleMask[I] = I;
30627 if (MaskB[I])
30628 ShuffleMask[I] = I + NumElts;
30629 }
30630
30631 // Only perform this blend if we can perform it without loading a mask.
30632 if ((VT != MVT::v16i16 ||
30633 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30634 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30635 canWidenShuffleElements(ShuffleMask))) {
30636 SDValue Shift1 =
30637 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30638 SDValue Shift2 =
30639 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30640 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30641 }
30642 }
30643
30644 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30645 // using vYiM vector operations where X*N == Y*M and M > N.
30646 if (ConstantAmt &&
30647 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30648 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30649 !Subtarget.hasXOP()) {
30650 MVT NarrowScalarVT = VT.getScalarType();
30651 // We can do this extra fast if each pair of narrow elements is shifted by the
30652 // same amount, SWAR style: use a single wide shift to move the valid bits into
30653 // position, then mask out any bits that crossed from one narrow element to the
30654 // other.
30655 // This optimized lowering is only valid if the elements in a pair can
30656 // be treated identically.
30657 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30658 SmallVector<SDValue, 32> TmpAmtWideElts;
30659 int WideEltSizeInBits = EltSizeInBits;
30660 while (WideEltSizeInBits < 32) {
30661 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30662 // unprofitable.
30663 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30664 break;
30665 }
30666 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30667 bool SameShifts = true;
30668 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30669 unsigned DstI = SrcI / 2;
30670 // Both elements are undef? Make a note and keep going.
30671 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30672 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30673 continue;
30674 }
30675 // Even element is undef? We will shift it by the same shift amount as
30676 // the odd element.
30677 if (AmtWideElts[SrcI].isUndef()) {
30678 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30679 continue;
30680 }
30681 // Odd element is undef? We will shift it by the same shift amount as
30682 // the even element.
30683 if (AmtWideElts[SrcI + 1].isUndef()) {
30684 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30685 continue;
30686 }
30687 // Both elements are equal.
30688 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30689 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30690 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30691 continue;
30692 }
30693 // One of the provisional wide elements will not have the same shift
30694 // amount. Let's bail.
30695 SameShifts = false;
30696 break;
30697 }
30698 if (!SameShifts) {
30699 break;
30700 }
30701 WideEltSizeInBits *= 2;
30702 std::swap(TmpAmtWideElts, AmtWideElts);
30703 }
30704 APInt APIntShiftAmt;
30705 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30706 bool Profitable = WidenShift;
30707 // AVX512BW brings support for vpsllvw.
30708 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30709 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30710 Profitable = false;
30711 }
30712 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30713 // fairly cheaply in other ways.
30714 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30715 Profitable = false;
30716 }
30717 // Leave it up to GFNI if we have it around.
30718 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30719 // is probably a win to use other strategies in some cases.
30720 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30721 Profitable = false;
30722 }
30723
30724 // AVX1 does not have vpand which makes our masking impractical. It does
30725 // have vandps but that is an FP instruction and crossing FP<->int typically
30726 // has some cost.
30727 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30728 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30729 Profitable = false;
30730 }
30731 unsigned WideNumElts = AmtWideElts.size();
30732 // We are only dealing with identical pairs.
30733 if (Profitable && WideNumElts != NumElts) {
30734 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30735 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30736 // Cast the operand to vXiM.
30737 SDValue RWide = DAG.getBitcast(WideVT, R);
30738 // Create our new vector of shift amounts.
30739 SDValue AmtWide = DAG.getBuildVector(
30740 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30741 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30742 // Perform the actual shift.
30743 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30744 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30745 // Now we need to construct a mask which will "drop" bits that get
30746 // shifted past the LSB/MSB. For a logical shift left, it will look
30747 // like:
30748 // FullMask = (1 << EltSizeInBits) - 1
30749 // Mask = FullMask << Amt
30750 //
30751 // This masking ensures that bits cannot migrate from one narrow lane to
30752 // another. The construction of this mask will be constant folded.
30753 // The mask for a logical right shift is nearly identical, the only
30754 // difference is that the all ones mask is shifted right instead of left.
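      // e.g. a v16i8 srl by a uniform amount of 3 becomes a v8i16 srl by 3
      // followed by an AND with 0x1F in every byte.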
30755 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30756 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30757 Mask = DAG.getBitcast(WideVT, Mask);
30758 // Finally, we mask the shifted vector with the SWAR mask.
30759 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30760 Masked = DAG.getBitcast(VT, Masked);
30761 if (Opc != ISD::SRA) {
30762 // Logical shifts are complete at this point.
30763 return Masked;
30764 }
30765 // At this point, we have done a *logical* shift right. We now need to
30766 // sign extend the result so that we get behavior equivalent to an
30767 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30768 // are `EltSizeInBits-AmtWide` bits wide.
30769 //
30770 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30771 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30772 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30773 // can use the following trick to accomplish this:
30774 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30775 // (Masked ^ SignBitMask) - SignBitMask
30776 //
30777 // When the sign bit is already clear, this will compute:
30778 // Masked + SignBitMask - SignBitMask
30779 //
30780 // This is equal to Masked which is what we want: the sign bit was clear
30781 // so sign extending should be a no-op.
30782 //
30783 // When the sign bit is set, this will compute:
30784 // Masked - SignBitmask - SignBitMask
30785 //
30786 // This is equal to Masked - 2*SignBitMask which will correctly sign
30787 // extend our result.
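      // e.g. EltSizeInBits == 8, AmtWide == 3: SignBitMask = 0x10; a masked
      // value of 0x1F (from an original 0xF8) gives 0x0F after the xor and
      // 0xFF after the sub, which is ashr(0xF8, 3).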
30788 SDValue SplatHighBit =
30789 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30790 // This does not induce recursion, all operands are constants.
30791 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30792 SDValue FlippedSignBit =
30793 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30794 SDValue Subtraction =
30795 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30796 return Subtraction;
30797 }
30798 }
30799
30800 // If possible, lower this packed shift into a vector multiply instead of
30801 // expanding it into a sequence of scalar shifts.
30802 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30803 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30804 Subtarget.canExtendTo512BW())))
30805 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30806 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30807
30808 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30809 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
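  // e.g. for v8i16, srl(X, 3) == mulhu(X, 1 << 13) since (X * 2^13) >> 16 is
  // X >> 3; lanes with Amt == 0 fall back to R via the select below because a
  // scale of 2^16 does not fit in an i16 element.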
30810 if (Opc == ISD::SRL && ConstantAmt &&
30811 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30812 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30813 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30814 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30815 SDValue Zero = DAG.getConstant(0, dl, VT);
30816 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30817 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30818 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30819 }
30820 }
30821
30822 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30823 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30824 // TODO: Special case handling for shift by 0/1, really we can afford either
30825 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30826 if (Opc == ISD::SRA && ConstantAmt &&
30827 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30828 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30829 !Subtarget.hasAVX512()) ||
30830 DAG.isKnownNeverZero(Amt))) {
30831 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30832 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30833 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30834 SDValue Amt0 =
30835 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30836 SDValue Amt1 =
30837 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30838 SDValue Sra1 =
30839 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30840 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30841 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30842 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30843 }
30844 }
30845
30846 // v4i32 Non Uniform Shifts.
30847 // If the shift amount is constant we can shift each lane using the SSE2
30848 // immediate shifts, else we need to zero-extend each lane to the lower i64
30849 // and shift using the SSE2 variable shifts.
30850 // The separate results can then be blended together.
30851 if (VT == MVT::v4i32) {
30852 SDValue Amt0, Amt1, Amt2, Amt3;
30853 if (ConstantAmt) {
30854 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30855 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30856 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30857 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30858 } else {
30859 // The SSE2 shifts use the lower i64 as the same shift amount for
30860 // all lanes and the upper i64 is ignored. On AVX we're better off
30861 // just zero-extending, but for SSE just duplicating the top 16-bits is
30862 // cheaper and has the same effect for out of range values.
30863 if (Subtarget.hasAVX()) {
30864 SDValue Z = DAG.getConstant(0, dl, VT);
30865 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30866 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30867 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30868 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30869 } else {
30870 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30871 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30872 {4, 5, 6, 7, -1, -1, -1, -1});
30873 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30874 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30875 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30876 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30877 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30878 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30879 }
30880 }
30881
30882 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30883 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30884 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30885 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30886 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30887
30888 // Merge the shifted lane results optimally with/without PBLENDW.
30889 // TODO - ideally shuffle combining would handle this.
30890 if (Subtarget.hasSSE41()) {
30891 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30892 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30893 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30894 }
30895 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30896 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30897 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30898 }
30899
30900 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30901 // look up the pre-computed shift values.
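  // Each 16-byte lane of the PSHUFB mask holds the lane's splat value shifted
  // by 0..7, followed by eight don't-care entries; the per-byte shift amount
  // then selects the matching entry (out-of-range amounts give undefined
  // results anyway).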
30902 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30903 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30904 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30905 unsigned NumLanes = VT.getSizeInBits() / 128u;
30906 unsigned NumEltsPerLane = NumElts / NumLanes;
30907 SmallVector<APInt, 16> LUT;
30908 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30909 unsigned LoElt = Lane * NumEltsPerLane;
30910 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30911 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30912 if (!KnownLane.isConstant())
30913 break;
30914 const APInt &LaneSplat = KnownLane.getConstant();
30915 for (unsigned I = 0; I != 8; ++I) {
30916 if (Opc == ISD::SHL)
30917 LUT.push_back(LaneSplat.shl(I));
30918 else if (Opc == ISD::SRL)
30919 LUT.push_back(LaneSplat.lshr(I));
30920 else if (Opc == ISD::SRA)
30921 LUT.push_back(LaneSplat.ashr(I));
30922 }
30923 LUT.append(8, APInt::getZero(8));
30924 }
30925 if (LUT.size() == NumElts) {
30926 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30927 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30928 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30929 }
30930 }
30931
30932 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30933 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30934 // make the existing SSE solution better.
30935 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30936 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30937 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30938 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30939 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30940 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30941 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30942 "Unexpected vector type");
30943 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30944 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30945 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30946 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30947 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30948 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30949 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30950 }
30951
30952 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30953 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30954 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30955 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30956 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30957 !Subtarget.hasXOP()) {
30958 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30959 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30960
30961 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30962 // isn't legal).
30963 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30964 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30965 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30966 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30968 "Constant build vector expected");
30969
30970 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30971 bool IsSigned = Opc == ISD::SRA;
30972 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30973 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30974 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30975 return DAG.getZExtOrTrunc(R, dl, VT);
30976 }
30977
30978 SmallVector<SDValue, 16> LoAmt, HiAmt;
30979 for (unsigned i = 0; i != NumElts; i += 16) {
30980 for (int j = 0; j != 8; ++j) {
30981 LoAmt.push_back(Amt.getOperand(i + j));
30982 HiAmt.push_back(Amt.getOperand(i + j + 8));
30983 }
30984 }
30985
30986 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30987 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30988
30989 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30990 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30991 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30992 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30993 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30994 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30995 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30996 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30997 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30998 }
30999
31000 if (VT == MVT::v16i8 ||
31001 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31002 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31003 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31004
31005 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31006 if (VT.is512BitVector()) {
31007 // On AVX512BW targets we make use of the fact that VSELECT lowers
31008 // to a masked blend which selects bytes based just on the sign bit
31009 // extracted to a mask.
31010 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31011 V0 = DAG.getBitcast(VT, V0);
31012 V1 = DAG.getBitcast(VT, V1);
31013 Sel = DAG.getBitcast(VT, Sel);
31014 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31015 ISD::SETGT);
31016 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31017 } else if (Subtarget.hasSSE41()) {
31018 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31019 // on the sign bit.
31020 V0 = DAG.getBitcast(VT, V0);
31021 V1 = DAG.getBitcast(VT, V1);
31022 Sel = DAG.getBitcast(VT, Sel);
31023 return DAG.getBitcast(SelVT,
31024 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31025 }
31026 // On pre-SSE41 targets we test for the sign bit by comparing to
31027 // zero - a negative value will set all bits of the lanes to true
31028 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31029 SDValue Z = DAG.getConstant(0, dl, SelVT);
31030 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31031 return DAG.getSelect(dl, SelVT, C, V0, V1);
31032 };
31033
31034 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31035 // We can safely do this using i16 shifts as we're only interested in
31036 // the 3 lower bits of each byte.
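    // Each VSELECT step below tests the current MSB of the amount (originally
    // bit 2, then bit 1, then bit 0 as the amount is doubled) and conditionally
    // applies a shift by 4, 2 and 1 respectively.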
31037 Amt = DAG.getBitcast(ExtVT, Amt);
31038 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31039 Amt = DAG.getBitcast(VT, Amt);
31040
31041 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31042 // r = VSELECT(r, shift(r, 4), a);
31043 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31044 R = SignBitSelect(VT, Amt, M, R);
31045
31046 // a += a
31047 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31048
31049 // r = VSELECT(r, shift(r, 2), a);
31050 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31051 R = SignBitSelect(VT, Amt, M, R);
31052
31053 // a += a
31054 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31055
31056 // return VSELECT(r, shift(r, 1), a);
31057 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31058 R = SignBitSelect(VT, Amt, M, R);
31059 return R;
31060 }
31061
31062 if (Opc == ISD::SRA) {
31063 // For SRA we need to unpack each byte to the higher byte of an i16 vector
31064 // so we can correctly sign extend. We don't care what happens to the
31065 // lower byte.
31066 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31067 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31068 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31069 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31070 ALo = DAG.getBitcast(ExtVT, ALo);
31071 AHi = DAG.getBitcast(ExtVT, AHi);
31072 RLo = DAG.getBitcast(ExtVT, RLo);
31073 RHi = DAG.getBitcast(ExtVT, RHi);
31074
31075 // r = VSELECT(r, shift(r, 4), a);
31076 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31077 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31078 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31079 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31080
31081 // a += a
31082 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31083 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31084
31085 // r = VSELECT(r, shift(r, 2), a);
31086 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31087 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31088 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31089 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31090
31091 // a += a
31092 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31093 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31094
31095 // r = VSELECT(r, shift(r, 1), a);
31096 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31097 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31098 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31099 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31100
31101 // Logical shift the result back to the lower byte, leaving a zero upper
31102 // byte meaning that we can safely pack with PACKUSWB.
31103 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31104 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31105 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31106 }
31107 }
31108
31109 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31110 MVT ExtVT = MVT::v8i32;
31111 SDValue Z = DAG.getConstant(0, dl, VT);
31112 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31113 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31114 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31115 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31116 ALo = DAG.getBitcast(ExtVT, ALo);
31117 AHi = DAG.getBitcast(ExtVT, AHi);
31118 RLo = DAG.getBitcast(ExtVT, RLo);
31119 RHi = DAG.getBitcast(ExtVT, RHi);
31120 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31121 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31122 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31123 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31124 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31125 }
31126
31127 if (VT == MVT::v8i16) {
31128 // If we have a constant shift amount, the non-SSE41 path is best as
31129 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31130 bool UseSSE41 = Subtarget.hasSSE41() &&
31131 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31132
31133 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31134 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31135 // the sign bit.
31136 if (UseSSE41) {
31137 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31138 V0 = DAG.getBitcast(ExtVT, V0);
31139 V1 = DAG.getBitcast(ExtVT, V1);
31140 Sel = DAG.getBitcast(ExtVT, Sel);
31141 return DAG.getBitcast(
31142 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31143 }
31144 // On pre-SSE41 targets we splat the sign bit - a negative value will
31145 // set all bits of the lanes to true and VSELECT uses that in
31146 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31147 SDValue C =
31148 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31149 return DAG.getSelect(dl, VT, C, V0, V1);
31150 };
31151
31152 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31153 if (UseSSE41) {
31154 // On SSE41 targets we need to replicate the shift mask in both
31155 // bytes for PBLENDVB.
31156 Amt = DAG.getNode(
31157 ISD::OR, dl, VT,
31158 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31159 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31160 } else {
31161 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31162 }
31163
31164 // r = VSELECT(r, shift(r, 8), a);
31165 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31166 R = SignBitSelect(Amt, M, R);
31167
31168 // a += a
31169 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31170
31171 // r = VSELECT(r, shift(r, 4), a);
31172 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31173 R = SignBitSelect(Amt, M, R);
31174
31175 // a += a
31176 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31177
31178 // r = VSELECT(r, shift(r, 2), a);
31179 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31180 R = SignBitSelect(Amt, M, R);
31181
31182 // a += a
31183 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31184
31185 // return VSELECT(r, shift(r, 1), a);
31186 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31187 R = SignBitSelect(Amt, M, R);
31188 return R;
31189 }
31190
31191 // Decompose 256-bit shifts into 128-bit shifts.
31192 if (VT.is256BitVector())
31193 return splitVectorIntBinary(Op, DAG, dl);
31194
31195 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31196 return splitVectorIntBinary(Op, DAG, dl);
31197
31198 return SDValue();
31199}
31200
31201static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31202 SelectionDAG &DAG) {
31203 MVT VT = Op.getSimpleValueType();
31204 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31205 "Unexpected funnel shift opcode!");
31206
31207 SDLoc DL(Op);
31208 SDValue Op0 = Op.getOperand(0);
31209 SDValue Op1 = Op.getOperand(1);
31210 SDValue Amt = Op.getOperand(2);
31211 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31212 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31213
31214 if (VT.isVector()) {
31215 APInt APIntShiftAmt;
31216 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31217 unsigned NumElts = VT.getVectorNumElements();
31218
31219 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31220 if (IsFSHR)
31221 std::swap(Op0, Op1);
31222
31223 if (IsCstSplat) {
31224 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31225 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31226 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31227 {Op0, Op1, Imm}, DAG, Subtarget);
31228 }
31229 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31230 {Op0, Op1, Amt}, DAG, Subtarget);
31231 }
31232 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31233 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31234 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31235 "Unexpected funnel shift type!");
31236
31237 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31238 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31239 if (IsCstSplat) {
31240 // TODO: Can't use generic expansion as UNDEF amt elements can be
31241 // converted to other values when folded to shift amounts, losing the
31242 // splat.
31243 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31244 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31245 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31246 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31247 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31248
31249 if (EltSizeInBits == 8 &&
31250 (Subtarget.hasXOP() ||
31251 (useVPTERNLOG(Subtarget, VT) &&
31252 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31253 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31254 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31255 // the original vector width to handle cases where we split.
31256 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31257 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31258 SDValue ShX =
31259 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31260 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31261 SDValue ShY =
31262 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31263 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31264 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31265 DAG.getConstant(MaskX, DL, VT));
31266 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31267 DAG.getConstant(MaskY, DL, VT));
31268 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31269 }
31270
31271 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31272 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31273 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31274 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31275 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31276 }
31277
31278 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31279 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31280 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31281
31282 // Constant vXi16 funnel shifts can be efficiently handled by default.
31283 if (IsCst && EltSizeInBits == 16)
31284 return SDValue();
31285
31286 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31287 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31288 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31289
31290 // Split 256-bit integers on XOP/pre-AVX2 targets.
31291 // Split 512-bit integers on non 512-bit BWI targets.
31292 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31293 !Subtarget.hasAVX2())) ||
31294 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31295 EltSizeInBits < 32)) {
31296 // Pre-mask the amount modulo using the wider vector.
31297 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31298 return splitVectorOp(Op, DAG, DL);
31299 }
31300
31301 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31302 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31303 int ScalarAmtIdx = -1;
31304 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31305 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31306 if (EltSizeInBits == 16)
31307 return SDValue();
31308
31309 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31310 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31311 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31312 ScalarAmtIdx, Subtarget, DAG);
31313 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31314 ScalarAmtIdx, Subtarget, DAG);
31315 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31316 }
31317 }
31318
31319 MVT WideSVT = MVT::getIntegerVT(
31320 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31321 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31322
31323 // If per-element shifts are legal, fallback to generic expansion.
31324 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31325 return SDValue();
31326
31327 // Attempt to fold as:
31328 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31329 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
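    // e.g. for i8 elements widened to i16: W = (x << 8) | y, and
    // (W << (z & 7)) >> 8 yields (x << z) | (y >> (8 - z)), i.e. fshl(x, y, z).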
31330 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31331 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31332 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31333 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31334 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31335 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31336 EltSizeInBits, DAG);
31337 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31338 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31339 if (!IsFSHR)
31340 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31341 EltSizeInBits, DAG);
31342 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31343 }
31344
31345 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31346 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31347 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31348 SDValue Z = DAG.getConstant(0, DL, VT);
31349 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31350 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31351 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31352 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31353 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31354 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31355 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31356 }
31357
31358 // Fallback to generic expansion.
31359 return SDValue();
31360 }
31361 assert(
31362 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31363 "Unexpected funnel shift type!");
31364
31365 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31366 bool OptForSize = DAG.shouldOptForSize();
31367 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31368
31369 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31370 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31371 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31372 !isa<ConstantSDNode>(Amt)) {
31373 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31374 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31375 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31376 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31377 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31378 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31379 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31380 if (IsFSHR) {
31381 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31382 } else {
31383 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31384 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31385 }
31386 return DAG.getZExtOrTrunc(Res, DL, VT);
31387 }
31388
31389 if (VT == MVT::i8 || ExpandFunnel)
31390 return SDValue();
31391
31392 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
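  // The hardware masks the SHLD/SHRD count mod 32 (mod 64 for 64-bit
  // operands), which already matches the funnel shift modulo for i32/i64.
  // For i16 a masked count in the range 16-31 is out of range (the result is
  // undefined), so reduce the amount mod 16 explicitly.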
31393 if (VT == MVT::i16) {
31394 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31395 DAG.getConstant(15, DL, Amt.getValueType()));
31396 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31397 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31398 }
31399
31400 return Op;
31401}
31402
31403static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31404 SelectionDAG &DAG) {
31405 MVT VT = Op.getSimpleValueType();
31406 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31407
31408 SDLoc DL(Op);
31409 SDValue R = Op.getOperand(0);
31410 SDValue Amt = Op.getOperand(1);
31411 unsigned Opcode = Op.getOpcode();
31412 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31413 int NumElts = VT.getVectorNumElements();
31414 bool IsROTL = Opcode == ISD::ROTL;
31415
31416 // Check for constant splat rotation amount.
31417 APInt CstSplatValue;
31418 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31419
31420 // Check for splat rotate by zero.
31421 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31422 return R;
31423
31424 // AVX512 implicitly uses modulo rotation amounts.
31425 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31426 // Attempt to rotate by immediate.
31427 if (IsCstSplat) {
31428 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31429 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31430 return DAG.getNode(RotOpc, DL, VT, R,
31431 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31432 }
31433
31434 // Else, fall-back on VPROLV/VPRORV.
31435 return Op;
31436 }
31437
31438 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31439 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31440 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31441 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31442 }
31443
31444 SDValue Z = DAG.getConstant(0, DL, VT);
31445
31446 if (!IsROTL) {
31447 // If the ISD::ROTR amount is constant, we're always better off converting
31448 // to ISD::ROTL.
31449 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31450 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31451
31452 // XOP targets always prefer ISD::ROTL.
31453 if (Subtarget.hasXOP())
31454 return DAG.getNode(ISD::ROTL, DL, VT, R,
31455 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31456 }
31457
31458 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31459 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31460 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31461 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31462 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31463 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31464 DAG.getTargetConstant(0, DL, MVT::i8));
31465 }
31466
31467 // Split 256-bit integers on XOP/pre-AVX2 targets.
31468 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31469 return splitVectorIntBinary(Op, DAG, DL);
31470
31471 // XOP has 128-bit vector variable + immediate rotates.
31472 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31473 // XOP implicitly uses modulo rotation amounts.
31474 if (Subtarget.hasXOP()) {
31475 assert(IsROTL && "Only ROTL expected");
31476 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31477
31478 // Attempt to rotate by immediate.
31479 if (IsCstSplat) {
31480 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31481 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31482 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31483 }
31484
31485 // Use general rotate by variable (per-element).
31486 return Op;
31487 }
31488
31489 // Rotate by a uniform constant - expand back to shifts.
31490 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31491 // to other values when folded to shift amounts, losing the splat.
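  // For example, a splat rotl by 3 on vXi8 becomes (x << 3) | (x >> 5), and a
  // splat rotr by 3 becomes (x >> 3) | (x << 5).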
31492 if (IsCstSplat) {
31493 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31494 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31495 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31496 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31497 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31498 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31499 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31500 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31501 }
31502
31503 // Split 512-bit integers on non 512-bit BWI targets.
31504 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31505 return splitVectorIntBinary(Op, DAG, DL);
31506
31507 assert(
31508 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31509 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31510 Subtarget.hasAVX2()) ||
31511 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31512 "Only vXi32/vXi16/vXi8 vector rotates supported");
31513
31514 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31515 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31516
31517 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31518 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31519
31520 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31521 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31522 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
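  // Unpacking x with itself widens each lane to the 2*bw-bit value (x:x), so a
  // plain shift of that lane performs the rotate. e.g. for bw = 8, x = 0x9A,
  // y = 3: (0x9A9A << 3) = 0xD4D0, and the high byte 0xD4 == rotl(0x9A, 3).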
31523 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31524 int BaseRotAmtIdx = -1;
31525 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31526 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31527 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31528 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31529 }
31530 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31531 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31532 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31533 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31534 BaseRotAmtIdx, Subtarget, DAG);
31535 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31536 BaseRotAmtIdx, Subtarget, DAG);
31537 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31538 }
31539 }
31540
31541 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31542 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31543
31544 // Attempt to fold as unpack(x,x) << zext(y):
31545 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31546 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31547 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31548 if (!(ConstantAmt && EltSizeInBits != 8) &&
31549 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31550 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31551 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31552 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31553 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31554 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31555 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31556 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31557 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31558 }
31559
31560 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31561 // the amount bit.
31562 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
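  // The three low bits of the amount select conditional rotates by 4, 2 and 1:
  // after the "Amt << 5" below, amount bit 2 sits in each byte's sign bit for
  // the first select, and each "a += a" moves the next lower bit into the sign
  // bit. e.g. an amount of 5 (0b101) applies the rot4 and rot1 stages only.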
31563 if (EltSizeInBits == 8) {
31564 MVT WideVT =
31565 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31566
31567 // Attempt to fold as:
31568 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31569 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31570 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31571 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31572 // If we're rotating by constant, just use default promotion.
31573 if (ConstantAmt)
31574 return SDValue();
31575 // See if we can perform this by widening to vXi16 or vXi32.
31576 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31577 R = DAG.getNode(
31578 ISD::OR, DL, WideVT, R,
31579 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31580 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31581 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31582 if (IsROTL)
31583 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31584 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31585 }
31586
31587 // We don't need ModuloAmt here as we just peek at individual bits.
31588 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31589 if (Subtarget.hasSSE41()) {
31590 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31591 // on the sign bit.
31592 V0 = DAG.getBitcast(VT, V0);
31593 V1 = DAG.getBitcast(VT, V1);
31594 Sel = DAG.getBitcast(VT, Sel);
31595 return DAG.getBitcast(SelVT,
31596 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31597 }
31598 // On pre-SSE41 targets we test for the sign bit by comparing to
31599 // zero - a negative value will set all bits of the lanes to true
31600 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31601 SDValue Z = DAG.getConstant(0, DL, SelVT);
31602 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31603 return DAG.getSelect(DL, SelVT, C, V0, V1);
31604 };
31605
31606 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31607 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31608 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31609 IsROTL = true;
31610 }
31611
31612 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31613 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31614
31615 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31616 // We can safely do this using i16 shifts as we're only interested in
31617 // the 3 lower bits of each byte.
31618 Amt = DAG.getBitcast(ExtVT, Amt);
31619 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31620 Amt = DAG.getBitcast(VT, Amt);
31621
31622 // r = VSELECT(r, rot(r, 4), a);
31623 SDValue M;
31624 M = DAG.getNode(
31625 ISD::OR, DL, VT,
31626 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31627 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31628 R = SignBitSelect(VT, Amt, M, R);
31629
31630 // a += a
31631 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31632
31633 // r = VSELECT(r, rot(r, 2), a);
31634 M = DAG.getNode(
31635 ISD::OR, DL, VT,
31636 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31637 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31638 R = SignBitSelect(VT, Amt, M, R);
31639
31640 // a += a
31641 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31642
31643 // return VSELECT(r, rot(r, 1), a);
31644 M = DAG.getNode(
31645 ISD::OR, DL, VT,
31646 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31647 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31648 return SignBitSelect(VT, Amt, M, R);
31649 }
31650
31651 bool IsSplatAmt = DAG.isSplatValue(Amt);
31652 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31653 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31654
31655 // Fallback for splats + all supported variable shifts.
31656 // Fallback for non-constant AVX2 vXi16 as well.
31657 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31658 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31659 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31660 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31661 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31662 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31663 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31664 }
31665
31666 // Everything below assumes ISD::ROTL.
31667 if (!IsROTL) {
31668 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31669 IsROTL = true;
31670 }
31671
31672 // ISD::ROT* uses modulo rotate amounts.
31673 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31674
31675 assert(IsROTL && "Only ROTL supported");
31676
31677 // As with shifts, attempt to convert the rotation amount to a multiplication
31678 // factor, fallback to general expansion.
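  // e.g. a vXi16 rotl by 3 uses Scale = 1 << 3: MUL yields the low 16 bits of
  // R * 8 (i.e. R << 3) and MULHU yields the high 16 bits (i.e. R >> 13), and
  // OR'ing the two halves reassembles the rotate.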
31679 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31680 if (!Scale)
31681 return SDValue();
31682
31683 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31684 if (EltSizeInBits == 16) {
31685 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31686 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31687 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31688 }
31689
31690 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31691 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31692 // that can then be OR'd with the lower 32-bits.
31693 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31694 static const int OddMask[] = {1, 1, 3, 3};
31695 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31696 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31697
31698 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31699 DAG.getBitcast(MVT::v2i64, R),
31700 DAG.getBitcast(MVT::v2i64, Scale));
31701 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31702 DAG.getBitcast(MVT::v2i64, R13),
31703 DAG.getBitcast(MVT::v2i64, Scale13));
31704 Res02 = DAG.getBitcast(VT, Res02);
31705 Res13 = DAG.getBitcast(VT, Res13);
31706
31707 return DAG.getNode(ISD::OR, DL, VT,
31708 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31709 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31710}
31711
31712/// Returns true if the operand type is exactly twice the native width, and
31713/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31714/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31715/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31716bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31717 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31718
31719 if (OpWidth == 64)
31720 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31721 if (OpWidth == 128)
31722 return Subtarget.canUseCMPXCHG16B();
31723
31724 return false;
31725}
31726
31727TargetLowering::AtomicExpansionKind
31728X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31729 Type *MemType = SI->getValueOperand()->getType();
31730
31731 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31732 !Subtarget.useSoftFloat()) {
31733 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31734 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31735 return AtomicExpansionKind::None;
31736
31737 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31738 Subtarget.hasAVX())
31739 return AtomicExpansionKind::None;
31740 }
31741
31742 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31743 : AtomicExpansionKind::None;
31744}
31745
31746// Note: this turns large loads into lock cmpxchg8b/16b.
31747TargetLowering::AtomicExpansionKind
31748X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31749 Type *MemType = LI->getType();
31750
31751 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31752 !Subtarget.useSoftFloat()) {
31753 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31754 // can use movq to do the load. If we have X87 we can load into an 80-bit
31755 // X87 register and store it to a stack temporary.
31756 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31757 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31758 return AtomicExpansionKind::None;
31759
31760 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31761 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31762 Subtarget.hasAVX())
31763 return AtomicExpansionKind::None;
31764 }
31765
31766 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31767 : AtomicExpansionKind::None;
31768}
31769
31770enum BitTestKind : unsigned {
31771 UndefBit,
31772 ConstantBit,
31773 NotConstantBit,
31774 ShiftBit,
31775 NotShiftBit
31776};
31777
31778static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31779 using namespace llvm::PatternMatch;
31780 BitTestKind BTK = UndefBit;
31781 if (auto *C = dyn_cast<ConstantInt>(V)) {
31782 // Check if V is a power of 2 or the NOT of a power of 2.
31783 if (isPowerOf2_64(C->getZExtValue()))
31784 BTK = ConstantBit;
31785 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31786 BTK = NotConstantBit;
31787 return {V, BTK};
31788 }
31789
31790 // Check if V is some power of 2 pattern known to be non-zero
31791 if (auto *I = dyn_cast<Instruction>(V)) {
31792 bool Not = false;
31793 // Check if we have a NOT
31794 Value *PeekI;
31795 if (match(I, m_Not(m_Value(PeekI))) ||
31796 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31797 Not = true;
31798 I = dyn_cast<Instruction>(PeekI);
31799
31800 // If I is constant, it will fold and we can evaluate later. If it's an
31801 // argument or something of that nature, we can't analyze.
31802 if (I == nullptr)
31803 return {nullptr, UndefBit};
31804 }
31805 // We can only use 1 << X without more sophisticated analysis. C << X where
31806 // C is a power of 2 but not 1 can result in zero which cannot be translated
31807 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
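    // e.g. on i32, (2 << 31) == 0 and (4 >> 3) == 0, and a zero mask has no
    // single bit index that a bit-test instruction could operate on.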
31808 if (I->getOpcode() == Instruction::Shl) {
31809 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31810 // -X` and some other provable power of 2 patterns that we can use CTZ on
31811 // may be profitable.
31812 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31813 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31814 // be provably a non-zero power of 2.
31815 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31816 // transformable to bittest.
31817 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31818 if (!ShiftVal)
31819 return {nullptr, UndefBit};
31820 if (ShiftVal->equalsInt(1))
31821 BTK = Not ? NotShiftBit : ShiftBit;
31822
31823 if (BTK == UndefBit)
31824 return {nullptr, UndefBit};
31825
31826 Value *BitV = I->getOperand(1);
31827
31828 // Read past a shiftmask instruction to find count
31829 Value *AndOp;
31830 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31831 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31832 BitV = AndOp;
31833
31834 return {BitV, BTK};
31835 }
31836 }
31837 return {nullptr, UndefBit};
31838}
31839
31840TargetLowering::AtomicExpansionKind
31841X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31842 using namespace llvm::PatternMatch;
31843 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31844 // prefix to a normal instruction for these operations.
31845 if (AI->use_empty())
31846 return AtomicExpansionKind::None;
31847
31848 if (AI->getOperation() == AtomicRMWInst::Xor) {
31849 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31850 // preferable to both `cmpxchg` and `btc`.
31851 if (match(AI->getOperand(1), m_SignMask()))
31852 return AtomicExpansionKind::Expand;
31853 }
31854
31855 // If the atomicrmw's result is used by a single bit AND, we may use
31856 // bts/btr/btc instruction for these operations.
31857 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31858 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31859 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31860 // detect it.
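  // e.g. "%old = atomicrmw or ptr %p, %m" followed by "and %old, %m", where %m
  // is a known single set bit (1 << %b), can lower to "lock bts" with the
  // carry flag providing the value of the tested bit.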
31861 Instruction *I = AI->user_back();
31862 auto BitChange = FindSingleBitChange(AI->getValOperand());
31863 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31864 I->getOpcode() != Instruction::And ||
31865 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31866 AI->getParent() != I->getParent())
31867 return AtomicExpansionKind::CmpXChg;
31868
31869 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31870
31871 // This is a redundant AND, it should get cleaned up elsewhere.
31872 if (AI == I->getOperand(OtherIdx))
31873 return AtomicExpansionKind::CmpXChg;
31874
31875 // The following instruction must be an AND of a single bit.
31876 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31877 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31878 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31879 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31880 return AtomicExpansionKind::CmpXChg;
31881 }
31882 if (AI->getOperation() == AtomicRMWInst::And) {
31883 return ~C1->getValue() == C2->getValue()
31884 ? AtomicExpansionKind::BitTestIntrinsic
31885 : AtomicExpansionKind::CmpXChg;
31886 }
31887 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31888 : AtomicExpansionKind::CmpXChg;
31889 }
31890
31891 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31892
31893 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31894 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31895 return AtomicExpansionKind::CmpXChg;
31896
31897 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31898
31899 // If shift amounts are not the same we can't use BitTestIntrinsic.
31900 if (BitChange.first != BitTested.first)
31901 return AtomicExpansionKind::CmpXChg;
31902
31903 // For an atomic AND, the value must be a mask with all bits set except one,
31904 // and the user AND must test the single bit that is unset in that mask.
31905 if (AI->getOperation() == AtomicRMWInst::And)
31906 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31907 ? AtomicExpansionKind::BitTestIntrinsic
31908 : AtomicExpansionKind::CmpXChg;
31909
31910 // For an atomic XOR/OR, the value must set and the user AND must test the same bit.
31911 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31912 ? AtomicExpansionKind::BitTestIntrinsic
31913 : AtomicExpansionKind::CmpXChg;
31914}
31915
31916void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31917 IRBuilder<> Builder(AI);
31918 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31919 Intrinsic::ID IID_C;
31920 Intrinsic::ID IID_I;
31921 switch (AI->getOperation()) {
31922 default:
31923 llvm_unreachable("Unknown atomic operation");
31924 case AtomicRMWInst::Or:
31925 IID_C = Intrinsic::x86_atomic_bts;
31926 IID_I = Intrinsic::x86_atomic_bts_rm;
31927 break;
31928 case AtomicRMWInst::Xor:
31929 IID_C = Intrinsic::x86_atomic_btc;
31930 IID_I = Intrinsic::x86_atomic_btc_rm;
31931 break;
31932 case AtomicRMWInst::And:
31933 IID_C = Intrinsic::x86_atomic_btr;
31934 IID_I = Intrinsic::x86_atomic_btr_rm;
31935 break;
31936 }
31937 Instruction *I = AI->user_back();
31938 LLVMContext &Ctx = AI->getContext();
31939 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31940 PointerType::getUnqual(Ctx));
31941 Value *Result = nullptr;
31942 auto BitTested = FindSingleBitChange(AI->getValOperand());
31943 assert(BitTested.first != nullptr);
31944
31945 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31946 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31947
31948 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31949 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31950 {Addr, Builder.getInt8(Imm)});
31951 } else {
31952 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31953
31954 Value *SI = BitTested.first;
31955 assert(SI != nullptr);
31956
31957 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
31958 // need to mask it.
31959 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31960 Value *BitPos =
31961 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31962 // Todo(1): In many cases it may be provable that SI is less than
31963 // ShiftBits in which case this mask is unnecessary
31964 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31965 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31966 // favor of just a raw BT{S|R|C}.
31967
31968 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31969 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31970
31971 // If the result is only used for zero/non-zero status then we don't need to
31972 // shift the value back. Otherwise do so.
31973 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31974 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31975 if (ICmp->isEquality()) {
31976 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31977 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31978 if (C0 || C1) {
31979 assert(C0 == nullptr || C1 == nullptr);
31980 if ((C0 ? C0 : C1)->isZero())
31981 continue;
31982 }
31983 }
31984 }
31985 Result = Builder.CreateShl(Result, BitPos);
31986 break;
31987 }
31988 }
31989
31990 I->replaceAllUsesWith(Result);
31991 I->eraseFromParent();
31992 AI->eraseFromParent();
31993}
31994
31995static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31996 using namespace llvm::PatternMatch;
31997 if (!AI->hasOneUse())
31998 return false;
31999
32000 Value *Op = AI->getOperand(1);
32001 CmpPredicate Pred;
32002 Instruction *I = AI->user_back();
32003 AtomicRMWInst::BinOp Opc = AI->getOperation();
32004 if (Opc == AtomicRMWInst::Add) {
32005 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32006 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32007 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32008 if (match(I->user_back(),
32010 return true;
32011 if (match(I->user_back(),
32013 return true;
32014 }
32015 return false;
32016 }
32017 if (Opc == AtomicRMWInst::Sub) {
32018 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32019 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32020 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32021 if (match(I->user_back(),
32023 return true;
32024 if (match(I->user_back(),
32026 return true;
32027 }
32028 return false;
32029 }
32030 if ((Opc == AtomicRMWInst::Or &&
32032 (Opc == AtomicRMWInst::And &&
32034 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32035 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32036 Pred == CmpInst::ICMP_SLT;
32037 if (match(I->user_back(),
32039 return true;
32040 return false;
32041 }
32042 if (Opc == AtomicRMWInst::Xor) {
32043 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32044 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32045 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32046 if (match(I->user_back(),
32048 return true;
32049 if (match(I->user_back(),
32051 return true;
32052 }
32053 return false;
32054 }
32055
32056 return false;
32057}
32058
32059void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32060 AtomicRMWInst *AI) const {
32061 IRBuilder<> Builder(AI);
32062 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32063 Instruction *TempI = nullptr;
32064 LLVMContext &Ctx = AI->getContext();
32065 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32066 if (!ICI) {
32067 TempI = AI->user_back();
32068 assert(TempI->hasOneUse() && "Must have one use");
32069 ICI = cast<ICmpInst>(TempI->user_back());
32070 }
32071 X86::CondCode CC;
32072 ICmpInst::Predicate Pred = ICI->getPredicate();
32073 switch (Pred) {
32074 default:
32075 llvm_unreachable("Not supported Pred");
32076 case CmpInst::ICMP_EQ:
32077 CC = X86::COND_E;
32078 break;
32079 case CmpInst::ICMP_NE:
32080 CC = X86::COND_NE;
32081 break;
32082 case CmpInst::ICMP_SLT:
32083 CC = X86::COND_S;
32084 break;
32085 case CmpInst::ICMP_SGT:
32086 CC = X86::COND_NS;
32087 break;
32088 }
32089 Intrinsic::ID IID;
32090 switch (AI->getOperation()) {
32091 default:
32092 llvm_unreachable("Unknown atomic operation");
32093 case AtomicRMWInst::Add:
32094 IID = Intrinsic::x86_atomic_add_cc;
32095 break;
32096 case AtomicRMWInst::Sub:
32097 IID = Intrinsic::x86_atomic_sub_cc;
32098 break;
32099 case AtomicRMWInst::Or:
32100 IID = Intrinsic::x86_atomic_or_cc;
32101 break;
32102 case AtomicRMWInst::And:
32103 IID = Intrinsic::x86_atomic_and_cc;
32104 break;
32105 case AtomicRMWInst::Xor:
32106 IID = Intrinsic::x86_atomic_xor_cc;
32107 break;
32108 }
32109 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32110 PointerType::getUnqual(Ctx));
32111 Value *Call = Builder.CreateIntrinsic(
32112 IID, AI->getType(),
32113 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32114 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32115 ICI->replaceAllUsesWith(Result);
32116 ICI->eraseFromParent();
32117 if (TempI)
32118 TempI->eraseFromParent();
32119 AI->eraseFromParent();
32120}
32121
32122TargetLowering::AtomicExpansionKind
32123X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32124 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32125 Type *MemType = AI->getType();
32126
32127 // If the operand is too big, we must see if cmpxchg8/16b is available
32128 // and default to library calls otherwise.
32129 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32130 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32131 : AtomicExpansionKind::None;
32132 }
32133
32134 AtomicRMWInst::BinOp Op = AI->getOperation();
32135 switch (Op) {
32136 case AtomicRMWInst::Xchg:
32137 return AtomicExpansionKind::None;
32138 case AtomicRMWInst::Add:
32139 case AtomicRMWInst::Sub:
32140 if (shouldExpandCmpArithRMWInIR(AI))
32141 return AtomicExpansionKind::CmpArithIntrinsic;
32142 // It's better to use xadd, xsub or xchg for these in other cases.
32143 return AtomicExpansionKind::None;
32144 case AtomicRMWInst::Or:
32145 case AtomicRMWInst::And:
32146 case AtomicRMWInst::Xor:
32147 if (shouldExpandCmpArithRMWInIR(AI))
32148 return AtomicExpansionKind::CmpArithIntrinsic;
32149 return shouldExpandLogicAtomicRMWInIR(AI);
32151 case AtomicRMWInst::Max:
32152 case AtomicRMWInst::Min:
32163 default:
32164 // These always require a non-trivial set of data operations on x86. We must
32165 // use a cmpxchg loop.
32166 return AtomicExpansionKind::CmpXChg;
32167 }
32168}
32169
32170LoadInst *
32171X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32172 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32173 Type *MemType = AI->getType();
32174 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32175 // there is no benefit in turning such RMWs into loads, and it is actually
32176 // harmful as it introduces a mfence.
32177 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32178 return nullptr;
32179
32180 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32181 // lowering available in lowerAtomicArith.
32182 // TODO: push more cases through this path.
32183 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32184 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32185 AI->use_empty())
32186 return nullptr;
32187
32188 IRBuilder<> Builder(AI);
32189 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32190 auto SSID = AI->getSyncScopeID();
32191 // We must restrict the ordering to avoid generating loads with Release or
32192 // ReleaseAcquire orderings.
32193 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32194
32195 // Before the load we need a fence. Here is an example lifted from
32196 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32197 // is required:
32198 // Thread 0:
32199 // x.store(1, relaxed);
32200 // r1 = y.fetch_add(0, release);
32201 // Thread 1:
32202 // y.fetch_add(42, acquire);
32203 // r2 = x.load(relaxed);
32204 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32205 // lowered to just a load without a fence. A mfence flushes the store buffer,
32206 // making the optimization clearly correct.
32207 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32208 // otherwise, we might be able to be more aggressive on relaxed idempotent
32209 // rmw. In practice, they do not look useful, so we don't try to be
32210 // especially clever.
32211
32212 // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32213 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence.
32214 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32215
32216 // Finally we can emit the atomic load.
32217 LoadInst *Loaded = Builder.CreateAlignedLoad(
32218 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32219 Loaded->setAtomic(Order, SSID);
32220 AI->replaceAllUsesWith(Loaded);
32221 AI->eraseFromParent();
32222 return Loaded;
32223}
32224
32225/// Emit a locked operation on a stack location which does not change any
32226/// memory location, but does involve a lock prefix. Location is chosen to be
32227/// a) very likely accessed only by a single thread to minimize cache traffic,
32228/// and b) definitely dereferenceable. Returns the new Chain result.
32229static SDValue emitLockedStackOp(SelectionDAG &DAG,
32230 const X86Subtarget &Subtarget, SDValue Chain,
32231 const SDLoc &DL) {
32232 // Implementation notes:
32233 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32234 // operations issued by the current processor. As such, the location
32235 // referenced is not relevant for the ordering properties of the instruction.
32236 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32237 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32238 // 2) Using an immediate operand appears to be the best encoding choice
32239 // here since it doesn't require an extra register.
32240 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32241 // is small enough it might just be measurement noise.)
32242 // 4) When choosing offsets, there are several contributing factors:
32243 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32244 // line aligned stack object to improve this case.)
32245 // b) To minimize our chances of introducing a false dependence, we prefer
32246 // to offset the stack usage from TOS slightly.
32247 // c) To minimize concerns about cross thread stack usage - in particular,
32248 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32249 // captures state in the TOS frame and accesses it from many threads -
32250 // we want to use an offset such that the offset is in a distinct cache
32251 // line from the TOS frame.
32252 //
32253 // For a general discussion of the tradeoffs and benchmark results, see:
32254 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
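  // The node built below is effectively "lock orl $0x0, -64(%rsp)" when a red
  // zone is available (offset 0 otherwise, and ESP on 32-bit targets): the OR
  // of zero leaves memory unchanged, while the LOCK prefix still provides the
  // full barrier.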
32255
32256 auto &MF = DAG.getMachineFunction();
32257 auto &TFL = *Subtarget.getFrameLowering();
32258 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32259
32260 if (Subtarget.is64Bit()) {
32261 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32262 SDValue Ops[] = {
32263 DAG.getRegister(X86::RSP, MVT::i64), // Base
32264 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32265 DAG.getRegister(0, MVT::i64), // Index
32266 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32267 DAG.getRegister(0, MVT::i16), // Segment.
32268 Zero,
32269 Chain};
32270 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32271 MVT::Other, Ops);
32272 return SDValue(Res, 1);
32273 }
32274
32275 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32276 SDValue Ops[] = {
32277 DAG.getRegister(X86::ESP, MVT::i32), // Base
32278 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32279 DAG.getRegister(0, MVT::i32), // Index
32280 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32281 DAG.getRegister(0, MVT::i16), // Segment.
32282 Zero,
32283 Chain
32284 };
32285 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32286 MVT::Other, Ops);
32287 return SDValue(Res, 1);
32288}
32289
32290static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32291 SelectionDAG &DAG) {
32292 SDLoc dl(Op);
32293 AtomicOrdering FenceOrdering =
32294 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32295 SyncScope::ID FenceSSID =
32296 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32297
32298 // The only fence that needs an instruction is a sequentially-consistent
32299 // cross-thread fence.
32300 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32301 FenceSSID == SyncScope::System) {
32302 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32303 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32304
32305 SDValue Chain = Op.getOperand(0);
32306 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32307 }
32308
32309 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32310 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32311}
32312
32313static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32314 SelectionDAG &DAG) {
32315 MVT T = Op.getSimpleValueType();
32316 SDLoc DL(Op);
32317 unsigned Reg = 0;
32318 unsigned size = 0;
32319 switch(T.SimpleTy) {
32320 default: llvm_unreachable("Invalid value type!");
32321 case MVT::i8: Reg = X86::AL; size = 1; break;
32322 case MVT::i16: Reg = X86::AX; size = 2; break;
32323 case MVT::i32: Reg = X86::EAX; size = 4; break;
32324 case MVT::i64:
32325 assert(Subtarget.is64Bit() && "Node not type legal!");
32326 Reg = X86::RAX; size = 8;
32327 break;
32328 }
32329 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32330 Op.getOperand(2), SDValue());
32331 SDValue Ops[] = { cpIn.getValue(0),
32332 Op.getOperand(1),
32333 Op.getOperand(3),
32334 DAG.getTargetConstant(size, DL, MVT::i8),
32335 cpIn.getValue(1) };
32336 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32337 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32338 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32339 Ops, T, MMO);
32340
32341 SDValue cpOut =
32342 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32343 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32344 MVT::i32, cpOut.getValue(2));
32345 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32346
32347 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32348 cpOut, Success, EFLAGS.getValue(1));
32349}
32350
32351// Create MOVMSKB, taking into account whether we need to split for AVX1.
32352static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32353 const X86Subtarget &Subtarget) {
32354 MVT InVT = V.getSimpleValueType();
32355
32356 if (InVT == MVT::v64i8) {
32357 SDValue Lo, Hi;
32358 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32359 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32360 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32361 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32362 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32363 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32364 DAG.getConstant(32, DL, MVT::i8));
32365 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32366 }
32367 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32368 SDValue Lo, Hi;
32369 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32370 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32371 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32372 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32373 DAG.getConstant(16, DL, MVT::i8));
32374 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32375 }
32376
32377 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32378}
32379
32380static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32381 SelectionDAG &DAG) {
32382 SDValue Src = Op.getOperand(0);
32383 MVT SrcVT = Src.getSimpleValueType();
32384 MVT DstVT = Op.getSimpleValueType();
32385
32386 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32387 // half to v32i1 and concatenating the result.
32388 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32389 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32390 assert(Subtarget.hasBWI() && "Expected BWI target");
32391 SDLoc dl(Op);
32392 SDValue Lo, Hi;
32393 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32394 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32395 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32396 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32397 }
32398
32399 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32400 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32401 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32402 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32403 SDLoc DL(Op);
32404 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32405 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32406 return DAG.getZExtOrTrunc(V, DL, DstVT);
32407 }
32408
32409 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32410 SrcVT == MVT::i64) && "Unexpected VT!");
32411
32412 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32413 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32414 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32415 // This conversion needs to be expanded.
32416 return SDValue();
32417
32418 SDLoc dl(Op);
32419 if (SrcVT.isVector()) {
32420 // Widen the input vector in the case of MVT::v2i32.
32421 // Example: from MVT::v2i32 to MVT::v4i32.
32422 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32423 SrcVT.getVectorNumElements() * 2);
32424 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32425 DAG.getUNDEF(SrcVT));
32426 } else {
32427 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32428 "Unexpected source type in LowerBITCAST");
32429 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32430 }
32431
32432 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32433 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32434
32435 if (DstVT == MVT::x86mmx)
32436 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32437
32438 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32439 DAG.getVectorIdxConstant(0, dl));
32440}
32441
32442/// Compute the horizontal sum of bytes in V for the elements of VT.
32443///
32444/// Requires V to be a byte vector and VT to be an integer vector type with
32445/// wider elements than V's type. The width of the elements of VT determines
32446/// how many bytes of V are summed horizontally to produce each element of the
32447/// result.
32448static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32449 const X86Subtarget &Subtarget,
32450 SelectionDAG &DAG) {
32451 SDLoc DL(V);
32452 MVT ByteVecVT = V.getSimpleValueType();
32453 MVT EltVT = VT.getVectorElementType();
32454 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32455 "Expected value to have byte element type.");
32456 assert(EltVT != MVT::i8 &&
32457 "Horizontal byte sum only makes sense for wider elements!");
32458 unsigned VecSize = VT.getSizeInBits();
32459 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32460
32461 // The PSADBW instruction horizontally adds all bytes and leaves the result
32462 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32463 if (EltVT == MVT::i64) {
32464 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32465 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32466 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32467 return DAG.getBitcast(VT, V);
32468 }
32469
32470 if (EltVT == MVT::i32) {
32471 // We unpack the low half and high half into i32s interleaved with zeros so
32472 // that we can use PSADBW to horizontally sum them. The most useful part of
32473 // this is that it lines up the results of two PSADBW instructions to be
32474 // two v2i64 vectors which concatenated are the 4 population counts. We can
32475 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32476 SDValue Zeros = DAG.getConstant(0, DL, VT);
32477 SDValue V32 = DAG.getBitcast(VT, V);
32478 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32479 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32480
32481 // Do the horizontal sums into two v2i64s.
32482 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32483 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32484 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32485 DAG.getBitcast(ByteVecVT, Low), Zeros);
32486 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32487 DAG.getBitcast(ByteVecVT, High), Zeros);
32488
32489 // Merge them together.
32490 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32491 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32492 DAG.getBitcast(ShortVecVT, Low),
32493 DAG.getBitcast(ShortVecVT, High));
32494
32495 return DAG.getBitcast(VT, V);
32496 }
32497
32498 // The only element type left is i16.
32499 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32500
32501 // To obtain the pop count for each i16 element starting from the pop count
32502 // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
32503 // i16s right by 8. It is important to shift as i16s since an i8 vector shift
32504 // isn't directly supported.
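  // e.g. if an i16 element holds the byte pop counts [Hi:Lo], then the byte
  // add of (V << 8) and V gives [Hi+Lo : Lo], and the final i16 shift right by
  // 8 leaves Hi+Lo, the pop count of the original 16 bits.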
32505 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32506 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32507 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32508 DAG.getBitcast(ByteVecVT, V));
32509 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32510}
32511
32512static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32513 const X86Subtarget &Subtarget,
32514 SelectionDAG &DAG) {
32515 MVT VT = Op.getSimpleValueType();
32516 MVT EltVT = VT.getVectorElementType();
32517 int NumElts = VT.getVectorNumElements();
32518 (void)EltVT;
32519 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32520
32521 // Implement a lookup table in register by using an algorithm based on:
32522 // http://wm.ite.pl/articles/sse-popcount.html
32523 //
32524 // The general idea is that every lower byte nibble in the input vector is an
32525 // index into an in-register pre-computed pop count table. We then split up
32526 // the input vector into two new ones: (1) a vector with only the shifted-right
32527 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32528 // masked out higher ones) for each byte. PSHUFB is used separately with both
32529 // to index the in-register table. Next, both are added and the result is an
32530 // i8 vector where each element contains the pop count for its input byte.
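  // e.g. for the input byte 0xB7 (0b10110111): the high nibble 0xB maps to 3,
  // the low nibble 0x7 maps to 3, and 3 + 3 = 6 == popcount(0xB7).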
32531 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32532 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32533 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32534 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32535
32536 SmallVector<SDValue, 64> LUTVec;
32537 for (int i = 0; i < NumElts; ++i)
32538 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32539 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32540 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32541
32542 // High nibbles
32543 SDValue FourV = DAG.getConstant(4, DL, VT);
32544 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32545
32546 // Low nibbles
32547 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32548
32549 // The input vector is used as the shuffle mask that indexes elements into
32550 // the LUT. After counting low and high nibbles, add the two results to
32551 // obtain the final pop count per i8 element.
32552 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32553 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32554 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32555}
32556
32557// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32558// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32559static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32560 const X86Subtarget &Subtarget,
32561 SelectionDAG &DAG) {
32562 MVT VT = Op.getSimpleValueType();
32563 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32564 "Unknown CTPOP type to handle");
32565 SDValue Op0 = Op.getOperand(0);
32566
32567 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32568 if (Subtarget.hasVPOPCNTDQ()) {
32569 unsigned NumElems = VT.getVectorNumElements();
32570 assert((VT.getVectorElementType() == MVT::i8 ||
32571 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32572 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32573 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32574 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32575 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32576 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32577 }
32578 }
32579
32580 // Decompose 256-bit ops into smaller 128-bit ops.
32581 if (VT.is256BitVector() && !Subtarget.hasInt256())
32582 return splitVectorIntUnary(Op, DAG, DL);
32583
32584 // Decompose 512-bit ops into smaller 256-bit ops.
32585 if (VT.is512BitVector() && !Subtarget.hasBWI())
32586 return splitVectorIntUnary(Op, DAG, DL);
32587
32588 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32589 if (VT.getScalarType() != MVT::i8) {
32590 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32591 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32592 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32593 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32594 }
32595
32596 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32597 if (!Subtarget.hasSSSE3())
32598 return SDValue();
32599
32600 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32601}
32602
32603static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32604 SelectionDAG &DAG) {
32605 MVT VT = N.getSimpleValueType();
32606 SDValue Op = N.getOperand(0);
32607 SDLoc DL(N);
32608
32609 if (VT.isScalarInteger()) {
32610 // Compute the lower/upper bounds of the active bits of the value,
32611 // allowing us to shift the active bits down if necessary to fit into the
32612 // special cases below.
32613 KnownBits Known = DAG.computeKnownBits(Op);
32614 if (Known.isConstant())
32615 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32616 unsigned LZ = Known.countMinLeadingZeros();
32617 unsigned TZ = Known.countMinTrailingZeros();
32618 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32619 unsigned ActiveBits = Known.getBitWidth() - LZ;
32620 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32621
32622 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
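    // e.g. 0b11 -> 3 - 1 = 2, 0b10 -> 2 - 1 = 1, 0b01 -> 1 - 0 = 1.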
32623 if (ShiftedActiveBits <= 2) {
32624 if (ActiveBits > 2)
32625 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32626 DAG.getShiftAmountConstant(TZ, VT, DL));
32627 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32628 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32629 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32630 DAG.getShiftAmountConstant(1, VT, DL)));
32631 return DAG.getZExtOrTrunc(Op, DL, VT);
32632 }
32633
32634 // i3 CTPOP - perform LUT into i32 integer.
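    // The constant packs popcount(i) for i = 7..0 as 2-bit fields, so shifting
    // it right by 2*x and masking with 3 yields popcount(x).
    // e.g. x = 5: (0b1110100110010100 >> 10) & 3 = 0b10 = 2.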
32635 if (ShiftedActiveBits <= 3) {
32636 if (ActiveBits > 3)
32637 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32638 DAG.getShiftAmountConstant(TZ, VT, DL));
32639 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32640 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32641 DAG.getShiftAmountConstant(1, VT, DL));
32642 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32643 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32644 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32645 DAG.getConstant(0x3, DL, MVT::i32));
32646 return DAG.getZExtOrTrunc(Op, DL, VT);
32647 }
32648
32649 // i4 CTPOP - perform LUT into i64 integer.
32650 if (ShiftedActiveBits <= 4 &&
32651 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32652 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32653 if (ActiveBits > 4)
32654 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32655 DAG.getShiftAmountConstant(TZ, VT, DL));
32656 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32657 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32658 DAG.getConstant(4, DL, MVT::i32));
32659 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32660 DAG.getShiftAmountOperand(MVT::i64, Op));
32661 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32662 DAG.getConstant(0x7, DL, MVT::i64));
32663 return DAG.getZExtOrTrunc(Op, DL, VT);
32664 }
32665
32666 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
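    // The multiply by 0x08040201 followed by ">> 3" and the 0x11111111 mask
    // places one source bit in each nibble; the multiply by 0x11111111 then
    // accumulates all eight nibbles into the top nibble, which the final
    // ">> 28" extracts as the population count.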
32667 if (ShiftedActiveBits <= 8) {
32668 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32669 if (ActiveBits > 8)
32670 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32671 DAG.getShiftAmountConstant(TZ, VT, DL));
32672 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32673 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32674 DAG.getConstant(0x08040201U, DL, MVT::i32));
32675 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32676 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32677 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32678 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32679 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32680 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32681 return DAG.getZExtOrTrunc(Op, DL, VT);
32682 }
32683
32684 return SDValue(); // fallback to generic expansion.
32685 }
32686
32687 assert(VT.isVector() &&
32688 "We only do custom lowering for vector population count.");
32689 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32690}
32691
32692static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32693 MVT VT = Op.getSimpleValueType();
32694 SDValue In = Op.getOperand(0);
32695 SDLoc DL(Op);
32696
32697 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32698 // perform the BITREVERSE.
32699 if (!VT.isVector()) {
32700 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32701 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32702 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32703 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32704 DAG.getVectorIdxConstant(0, DL));
32705 }
32706
32707 int NumElts = VT.getVectorNumElements();
32708 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32709
32710 // Decompose 256-bit ops into smaller 128-bit ops.
32711 if (VT.is256BitVector())
32712 return splitVectorIntUnary(Op, DAG, DL);
32713
32714 assert(VT.is128BitVector() &&
32715 "Only 128-bit vector bitreverse lowering supported.");
32716
32717 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32718 // perform the BSWAP in the shuffle.
32719 // It's best to shuffle using the second operand as this will implicitly allow
32720 // memory folding for multiple vectors.
32721 SmallVector<SDValue, 16> MaskElts;
32722 for (int i = 0; i != NumElts; ++i) {
32723 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32724 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32725 int PermuteByte = SourceByte | (2 << 5);
32726 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32727 }
32728 }
32729
32730 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32731 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32732 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32733 Res, Mask);
32734 return DAG.getBitcast(VT, Res);
32735}
32736
32737static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32738 SelectionDAG &DAG) {
32739 MVT VT = Op.getSimpleValueType();
32740
32741 if (Subtarget.hasXOP() && !VT.is512BitVector())
32742 return LowerBITREVERSE_XOP(Op, DAG);
32743
32744 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32745 "SSSE3 or GFNI required for BITREVERSE");
32746
32747 SDValue In = Op.getOperand(0);
32748 SDLoc DL(Op);
32749
32750 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32751 if (VT.is512BitVector() && !Subtarget.hasBWI())
32752 return splitVectorIntUnary(Op, DAG, DL);
32753
32754 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32755 if (VT.is256BitVector() && !Subtarget.hasInt256())
32756 return splitVectorIntUnary(Op, DAG, DL);
32757
32758 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32759 if (!VT.isVector()) {
32760 assert(
32761 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32762 "Only tested for i8/i16/i32/i64");
32763 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32764 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32765 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32766 DAG.getBitcast(MVT::v16i8, Res));
32767 Res =
32768 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32769 DAG.getVectorIdxConstant(0, DL));
32770 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32771 }
32772
32773 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32774
32775 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32776 if (VT.getScalarType() != MVT::i8) {
32777 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32778 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32779 Res = DAG.getBitcast(ByteVT, Res);
32780 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32781 return DAG.getBitcast(VT, Res);
32782 }
32783 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32784 "Only byte vector BITREVERSE supported");
32785
32786 unsigned NumElts = VT.getVectorNumElements();
32787
32788 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32789 if (Subtarget.hasGFNI()) {
32790 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32791 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32792 DAG.getTargetConstant(0, DL, MVT::i8));
32793 }
32794
32795 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32796 // two nibbles, and a PSHUFB lookup finds the bit reverse of each
32797 // 0-15 value (moved to the other nibble).
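  // e.g. for the input byte 0x1E (0b00011110): LoLUT[0xE] = 0x70 and
  // HiLUT[0x1] = 0x08, and 0x70 | 0x08 = 0x78 == bitreverse(0x1E).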
32798 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32799 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32800 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32801
32802 const int LoLUT[16] = {
32803 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32804 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32805 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32806 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32807 const int HiLUT[16] = {
32808 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32809 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32810 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32811 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32812
32813 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32814 for (unsigned i = 0; i < NumElts; ++i) {
32815 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32816 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32817 }
32818
32819 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32820 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32821 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32822 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32823 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32824}
32825
32826static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32827 SelectionDAG &DAG) {
32828 SDLoc DL(Op);
32829 SDValue X = Op.getOperand(0);
32830 MVT VT = Op.getSimpleValueType();
32831
32832 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32833 if (VT == MVT::i8 ||
32835 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32836 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32837 DAG.getConstant(0, DL, MVT::i8));
32838 // Copy the inverse of the parity flag into a register with setcc.
32839 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32840 // Extend to the original type.
32841 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32842 }
32843
32844 // If we have POPCNT, use the default expansion.
32845 if (Subtarget.hasPOPCNT())
32846 return SDValue();
32847
32848 if (VT == MVT::i64) {
32849 // Xor the high and low 16-bits together using a 32-bit operation.
32850 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32851 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32852 DAG.getConstant(32, DL, MVT::i8)));
32853 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32854 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32855 }
32856
32857 if (VT != MVT::i16) {
32858 // Xor the high and low 16-bits together using a 32-bit operation.
32859 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32860 DAG.getConstant(16, DL, MVT::i8));
32861 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32862 } else {
32863 // If the input is 16-bits, we need to extend to use an i32 shift below.
32864 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32865 }
32866
32867 // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
32868 // This should allow an h-reg to be used to save a shift.
32869 SDValue Hi = DAG.getNode(
32870 ISD::TRUNCATE, DL, MVT::i8,
32871 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32872 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32873 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32874 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32875
32876 // Copy the inverse of the parity flag into a register with setcc.
32877 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32878 // Extend to the original type.
32879 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32880}
32881
32883 const X86Subtarget &Subtarget) {
32884 unsigned NewOpc = 0;
32885 switch (N->getOpcode()) {
32886 case ISD::ATOMIC_LOAD_ADD:
32887 NewOpc = X86ISD::LADD;
32888 break;
32889 case ISD::ATOMIC_LOAD_SUB:
32890 NewOpc = X86ISD::LSUB;
32891 break;
32892 case ISD::ATOMIC_LOAD_OR:
32893 NewOpc = X86ISD::LOR;
32894 break;
32895 case ISD::ATOMIC_LOAD_XOR:
32896 NewOpc = X86ISD::LXOR;
32897 break;
32898 case ISD::ATOMIC_LOAD_AND:
32899 NewOpc = X86ISD::LAND;
32900 break;
32901 default:
32902 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32903 }
32904
32905 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32906
32907 return DAG.getMemIntrinsicNode(
32908 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32909 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32910 /*MemVT=*/N->getSimpleValueType(0), MMO);
32911}
32912
32913/// Lower atomic_load_ops into LOCK-prefixed operations.
32915 const X86Subtarget &Subtarget) {
32916 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32917 SDValue Chain = N->getOperand(0);
32918 SDValue LHS = N->getOperand(1);
32919 SDValue RHS = N->getOperand(2);
32920 unsigned Opc = N->getOpcode();
32921 MVT VT = N->getSimpleValueType(0);
32922 SDLoc DL(N);
32923
32924 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32925 // can only be lowered when the result is unused. They should have already
32926 // been transformed into a cmpxchg loop in AtomicExpand.
32927 if (N->hasAnyUseOfValue(0)) {
32928 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32929 // select LXADD if LOCK_SUB can't be selected.
32930 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32931 // can use LXADD as opposed to cmpxchg.
32932 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32933 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32934 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32935 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32936
32937 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32938 "Used AtomicRMW ops other than Add should have been expanded!");
32939 return N;
32940 }
32941
32942 // Specialized lowering for the canonical form of an idemptotent atomicrmw.
32943 // The core idea here is that since the memory location isn't actually
32944 // changing, all we need is a lowering for the *ordering* impacts of the
32945 // atomicrmw. As such, we can chose a different operation and memory
32946 // location to minimize impact on other code.
32947 // The above holds unless the node is marked volatile in which
32948 // case it needs to be preserved according to the langref.
32949 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32950 // On X86, the only ordering which actually requires an instruction is
32951 // seq_cst which isn't SingleThread, everything just needs to be preserved
32952 // during codegen and then dropped. Note that we expect (but don't assume),
32953 // that orderings other than seq_cst and acq_rel have been canonicalized to
32954 // a store or load.
32957 // Prefer a locked operation against a stack location to minimize cache
32958 // traffic. This assumes that stack locations are very likely to be
32959 // accessed only by the owning thread.
32960 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32961 assert(!N->hasAnyUseOfValue(0));
32962 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32963 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32964 DAG.getUNDEF(VT), NewChain);
32965 }
32966 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32967 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32968 assert(!N->hasAnyUseOfValue(0));
32969 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32970 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32971 DAG.getUNDEF(VT), NewChain);
32972 }
32973
32974 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32975 // RAUW the chain, but don't worry about the result, as it's unused.
32976 assert(!N->hasAnyUseOfValue(0));
32977 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32978 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32979 DAG.getUNDEF(VT), LockOp.getValue(1));
32980}
32981
32983 const X86Subtarget &Subtarget) {
32984 auto *Node = cast<AtomicSDNode>(Op.getNode());
32985 SDLoc dl(Node);
32986 EVT VT = Node->getMemoryVT();
32987
32988 bool IsSeqCst =
32989 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32990 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32991
32992 // If this store is not sequentially consistent and the type is legal
32993 // we can just keep it.
32994 if (!IsSeqCst && IsTypeLegal)
32995 return Op;
32996
32997 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32999 Attribute::NoImplicitFloat)) {
33000 SDValue Chain;
33001 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
33002 // vector store.
33003 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
33004 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
33005 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
33006 Node->getMemOperand());
33007 }
33008
33009 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33010 // is enabled.
33011 if (VT == MVT::i64) {
33012 if (Subtarget.hasSSE1()) {
33013 SDValue SclToVec =
33014 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33015 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33016 SclToVec = DAG.getBitcast(StVT, SclToVec);
33017 SDVTList Tys = DAG.getVTList(MVT::Other);
33018 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33019 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33020 MVT::i64, Node->getMemOperand());
33021 } else if (Subtarget.hasX87()) {
33022 // First load this into an 80-bit X87 register using a stack temporary.
33023 // This will put the whole integer into the significand.
33024 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33025 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33026 MachinePointerInfo MPI =
33028 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33030 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33031 SDValue LdOps[] = {Chain, StackPtr};
33033 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33034 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33035 Chain = Value.getValue(1);
33036
33037 // Now use an FIST to do the atomic store.
33038 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33039 Chain =
33040 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33041 StoreOps, MVT::i64, Node->getMemOperand());
33042 }
33043 }
33044
33045 if (Chain) {
33046 // If this is a sequentially consistent store, also emit an appropriate
33047 // barrier.
33048 if (IsSeqCst)
33049 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33050
33051 return Chain;
33052 }
33053 }
33054
33055 // Convert seq_cst store -> xchg
33056 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33057 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33058 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33059 Node->getOperand(0), Node->getOperand(2),
33060 Node->getOperand(1), Node->getMemOperand());
33061 return Swap.getValue(1);
33062}
33063
33065 SDNode *N = Op.getNode();
33066 MVT VT = N->getSimpleValueType(0);
33067 unsigned Opc = Op.getOpcode();
33068
33069 // Let legalize expand this if it isn't a legal type yet.
33070 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33071 return SDValue();
33072
33073 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33074 SDLoc DL(N);
33075
33076 // Set the carry flag.
33077 SDValue Carry = Op.getOperand(2);
33078 EVT CarryVT = Carry.getValueType();
33079 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33080 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33081
33082 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33083 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33084 Op.getOperand(0), Op.getOperand(1),
33085 Carry.getValue(1));
33086
33087 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33088 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33089 Sum.getValue(1), DL, DAG);
33090 if (N->getValueType(1) == MVT::i1)
33091 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33092
33093 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33094}
33095
33096static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33097 SelectionDAG &DAG) {
33098 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33099
33100 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33101 // which returns the values as { float, float } (in XMM0) or
33102 // { double, double } (which is returned in XMM0, XMM1).
33103 SDLoc dl(Op);
33104 SDValue Arg = Op.getOperand(0);
33105 EVT ArgVT = Arg.getValueType();
33106 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33107
33109 Args.emplace_back(Arg, ArgTy);
33110
33111 bool isF64 = ArgVT == MVT::f64;
33112 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33113 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33114 // the results are returned via SRet in memory.
33115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33116 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33117 const char *LibcallName = TLI.getLibcallName(LC);
33118 SDValue Callee =
33119 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33120
33121 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33122 : (Type *)FixedVectorType::get(ArgTy, 4);
33123
33125 CLI.setDebugLoc(dl)
33126 .setChain(DAG.getEntryNode())
33127 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33128
33129 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33130
33131 if (isF64)
33132 // Returned in xmm0 and xmm1.
33133 return CallResult.first;
33134
33135 // Returned in bits 0:31 and 32:64 xmm0.
33136 SDValue SinVal =
33137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33138 DAG.getVectorIdxConstant(0, dl));
33139 SDValue CosVal =
33140 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33141 DAG.getVectorIdxConstant(1, dl));
33142 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33143 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33144}
33145
33146/// Widen a vector input to a vector of NVT. The
33147/// input vector must have the same element type as NVT.
33149 bool FillWithZeroes = false) {
33150 // Check if InOp already has the right width.
33151 MVT InVT = InOp.getSimpleValueType();
33152 if (InVT == NVT)
33153 return InOp;
33154
33155 if (InOp.isUndef())
33156 return DAG.getUNDEF(NVT);
33157
33159 "input and widen element type must match");
33160
33161 unsigned InNumElts = InVT.getVectorNumElements();
33162 unsigned WidenNumElts = NVT.getVectorNumElements();
33163 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33164 "Unexpected request for vector widening");
33165
33166 SDLoc dl(InOp);
33167 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33168 SDValue N1 = InOp.getOperand(1);
33169 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33170 N1.isUndef()) {
33171 InOp = InOp.getOperand(0);
33172 InVT = InOp.getSimpleValueType();
33173 InNumElts = InVT.getVectorNumElements();
33174 }
33175 }
33178 EVT EltVT = InOp.getOperand(0).getValueType();
33179 SDValue FillVal =
33180 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33182 Ops.append(WidenNumElts - InNumElts, FillVal);
33183 return DAG.getBuildVector(NVT, dl, Ops);
33184 }
33185 SDValue FillVal =
33186 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33187 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33188 DAG.getVectorIdxConstant(0, dl));
33189}
33190
33192 SelectionDAG &DAG) {
33193 assert(Subtarget.hasAVX512() &&
33194 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33195
33197 SDValue Src = N->getValue();
33198 MVT VT = Src.getSimpleValueType();
33199 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33200 SDLoc dl(Op);
33201
33202 SDValue Scale = N->getScale();
33203 SDValue Index = N->getIndex();
33204 SDValue Mask = N->getMask();
33205 SDValue Chain = N->getChain();
33206 SDValue BasePtr = N->getBasePtr();
33207
33208 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33209 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33210 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33211 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33212 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33213 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33214 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33215 SDVTList VTs = DAG.getVTList(MVT::Other);
33216 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33217 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33218 N->getMemoryVT(), N->getMemOperand());
33219 }
33220 return SDValue();
33221 }
33222
33223 MVT IndexVT = Index.getSimpleValueType();
33224
33225 // If the index is v2i32, we're being called by type legalization and we
33226 // should just let the default handling take care of it.
33227 if (IndexVT == MVT::v2i32)
33228 return SDValue();
33229
33230 // If we don't have VLX and neither the passthru or index is 512-bits, we
33231 // need to widen until one is.
33232 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33233 !Index.getSimpleValueType().is512BitVector()) {
33234 // Determine how much we need to widen by to get a 512-bit type.
33235 unsigned Factor = std::min(512/VT.getSizeInBits(),
33236 512/IndexVT.getSizeInBits());
33237 unsigned NumElts = VT.getVectorNumElements() * Factor;
33238
33239 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33240 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33241 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33242
33243 Src = ExtendToType(Src, VT, DAG);
33244 Index = ExtendToType(Index, IndexVT, DAG);
33245 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33246 }
33247
33248 SDVTList VTs = DAG.getVTList(MVT::Other);
33249 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33250 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33251 N->getMemoryVT(), N->getMemOperand());
33252}
33253
33254static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33255 SelectionDAG &DAG) {
33256
33258 MVT VT = Op.getSimpleValueType();
33259 MVT ScalarVT = VT.getScalarType();
33260 SDValue Mask = N->getMask();
33261 MVT MaskVT = Mask.getSimpleValueType();
33262 SDValue PassThru = N->getPassThru();
33263 SDLoc dl(Op);
33264
33265 // Handle AVX masked loads which don't support passthru other than 0.
33266 if (MaskVT.getVectorElementType() != MVT::i1) {
33267 // We also allow undef in the isel pattern.
33268 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33269 return Op;
33270
33271 SDValue NewLoad = DAG.getMaskedLoad(
33272 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33273 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33274 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33275 N->isExpandingLoad());
33276 // Emit a blend.
33277 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33278 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33279 }
33280
33281 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33282 "Expanding masked load is supported on AVX-512 target only!");
33283
33284 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33285 "Expanding masked load is supported for 32 and 64-bit types only!");
33286
33287 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33288 "Cannot lower masked load op.");
33289
33290 assert((ScalarVT.getSizeInBits() >= 32 ||
33291 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33292 ScalarVT == MVT::f16))) &&
33293 "Unsupported masked load op.");
33294
33295 // This operation is legal for targets with VLX, but without
33296 // VLX the vector should be widened to 512 bit
33297 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33298 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33299 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33300
33301 // Mask element has to be i1.
33302 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33303 "Unexpected mask type");
33304
33305 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33306
33307 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33308 SDValue NewLoad = DAG.getMaskedLoad(
33309 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33310 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33311 N->getExtensionType(), N->isExpandingLoad());
33312
33313 SDValue Extract =
33314 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33315 DAG.getVectorIdxConstant(0, dl));
33316 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33317 return DAG.getMergeValues(RetOps, dl);
33318}
33319
33320static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33321 SelectionDAG &DAG) {
33323 SDValue DataToStore = N->getValue();
33324 MVT VT = DataToStore.getSimpleValueType();
33325 MVT ScalarVT = VT.getScalarType();
33326 SDValue Mask = N->getMask();
33327 SDLoc dl(Op);
33328
33329 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33330 "Expanding masked load is supported on AVX-512 target only!");
33331
33332 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33333 "Expanding masked load is supported for 32 and 64-bit types only!");
33334
33335 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33336 "Cannot lower masked store op.");
33337
33338 assert((ScalarVT.getSizeInBits() >= 32 ||
33339 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33340 ScalarVT == MVT::f16))) &&
33341 "Unsupported masked store op.");
33342
33343 // This operation is legal for targets with VLX, but without
33344 // VLX the vector should be widened to 512 bit
33345 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33346 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33347
33348 // Mask element has to be i1.
33349 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33350 "Unexpected mask type");
33351
33352 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33353
33354 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33355 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33356 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33357 N->getOffset(), Mask, N->getMemoryVT(),
33358 N->getMemOperand(), N->getAddressingMode(),
33359 N->isTruncatingStore(), N->isCompressingStore());
33360}
33361
33362static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33363 SelectionDAG &DAG) {
33364 assert(Subtarget.hasAVX2() &&
33365 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33366
33368 SDLoc dl(Op);
33369 MVT VT = Op.getSimpleValueType();
33370 SDValue Index = N->getIndex();
33371 SDValue Mask = N->getMask();
33372 SDValue PassThru = N->getPassThru();
33373 MVT IndexVT = Index.getSimpleValueType();
33374
33375 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33376
33377 // If the index is v2i32, we're being called by type legalization.
33378 if (IndexVT == MVT::v2i32)
33379 return SDValue();
33380
33381 // If we don't have VLX and neither the passthru or index is 512-bits, we
33382 // need to widen until one is.
33383 MVT OrigVT = VT;
33384 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33385 !IndexVT.is512BitVector()) {
33386 // Determine how much we need to widen by to get a 512-bit type.
33387 unsigned Factor = std::min(512/VT.getSizeInBits(),
33388 512/IndexVT.getSizeInBits());
33389
33390 unsigned NumElts = VT.getVectorNumElements() * Factor;
33391
33392 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33393 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33394 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33395
33396 PassThru = ExtendToType(PassThru, VT, DAG);
33397 Index = ExtendToType(Index, IndexVT, DAG);
33398 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33399 }
33400
33401 // Break dependency on the data register.
33402 if (PassThru.isUndef())
33403 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33404
33405 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33406 N->getScale() };
33407 SDValue NewGather = DAG.getMemIntrinsicNode(
33408 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33409 N->getMemOperand());
33410 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33411 DAG.getVectorIdxConstant(0, dl));
33412 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33413}
33414
33416 SDLoc dl(Op);
33417 SDValue Src = Op.getOperand(0);
33418 MVT DstVT = Op.getSimpleValueType();
33419
33421 unsigned SrcAS = N->getSrcAddressSpace();
33422
33423 assert(SrcAS != N->getDestAddressSpace() &&
33424 "addrspacecast must be between different address spaces");
33425
33426 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33427 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33428 } else if (DstVT == MVT::i64) {
33429 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33430 } else if (DstVT == MVT::i32) {
33431 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33432 } else {
33433 report_fatal_error("Bad address space in addrspacecast");
33434 }
33435 return Op;
33436}
33437
33438SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33439 SelectionDAG &DAG) const {
33440 // TODO: Eventually, the lowering of these nodes should be informed by or
33441 // deferred to the GC strategy for the function in which they appear. For
33442 // now, however, they must be lowered to something. Since they are logically
33443 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33444 // require special handling for these nodes), lower them as literal NOOPs for
33445 // the time being.
33447 Ops.push_back(Op.getOperand(0));
33448 if (Op->getGluedNode())
33449 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33450
33451 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33452 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33453}
33454
33455// Custom split CVTPS2PH with wide types.
33457 SDLoc dl(Op);
33458 EVT VT = Op.getValueType();
33459 SDValue Lo, Hi;
33460 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33461 EVT LoVT, HiVT;
33462 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33463 SDValue RC = Op.getOperand(1);
33464 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33465 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33466 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33467}
33468
33470 SelectionDAG &DAG) {
33471 unsigned IsData = Op.getConstantOperandVal(4);
33472
33473 // We don't support non-data prefetch without PREFETCHI.
33474 // Just preserve the chain.
33475 if (!IsData && !Subtarget.hasPREFETCHI())
33476 return Op.getOperand(0);
33477
33478 return Op;
33479}
33480
33482 SDNode *N = Op.getNode();
33483 SDValue Operand = N->getOperand(0);
33484 EVT VT = Operand.getValueType();
33485 SDLoc dl(N);
33486
33487 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33488
33489 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33490 // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33491 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33492 // promote this operator's result!
33493 SDValue Chain = DAG.getEntryNode();
33494 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33495 {Chain, Operand, One});
33496 return StrictFmul;
33497}
33498
33500 unsigned OpNo) {
33501 const APInt Operand(32, OpNo);
33502 std::string OpNoStr = llvm::toString(Operand, 10, false);
33503 std::string Str(" $");
33504
33505 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33506 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33507
33508 auto I = StringRef::npos;
33509 for (auto &AsmStr : AsmStrs) {
33510 // Match the OpNo string. We should match exactly to exclude match
33511 // sub-string, e.g. "$12" contain "$1"
33512 if (AsmStr.ends_with(OpNoStr1))
33513 I = AsmStr.size() - OpNoStr1.size();
33514
33515 // Get the index of operand in AsmStr.
33516 if (I == StringRef::npos)
33517 I = AsmStr.find(OpNoStr1 + ",");
33518 if (I == StringRef::npos)
33519 I = AsmStr.find(OpNoStr2);
33520
33521 if (I == StringRef::npos)
33522 continue;
33523
33524 assert(I > 0 && "Unexpected inline asm string!");
33525 // Remove the operand string and label (if exsit).
33526 // For example:
33527 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33528 // ==>
33529 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33530 // ==>
33531 // "call dword ptr "
33532 auto TmpStr = AsmStr.substr(0, I);
33533 I = TmpStr.rfind(':');
33534 if (I != StringRef::npos)
33535 TmpStr = TmpStr.substr(I + 1);
33536 return TmpStr.take_while(llvm::isAlpha);
33537 }
33538
33539 return StringRef();
33540}
33541
33543 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33544 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33545 // changed from indirect TargetLowering::C_Memory to direct
33546 // TargetLowering::C_Address.
33547 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33548 // location.
33549 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33550 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33551}
33552
33554 SDValue Mask) {
33555 EVT Ty = MVT::i8;
33556 auto V = DAG.getBitcast(MVT::i1, Mask);
33557 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33558 auto Zero = DAG.getConstant(0, DL, Ty);
33559 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33560 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33561 return SDValue(CmpZero.getNode(), 1);
33562}
33563
33565 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33566 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33567 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33568 // ->
33569 // _, flags = SUB 0, mask
33570 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33571 // bit_cast_to_vector<res>
33572 EVT VTy = PassThru.getValueType();
33573 EVT Ty = VTy.getVectorElementType();
33574 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33575 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33576 : DAG.getBitcast(Ty, PassThru);
33577 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33578 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33579 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33580 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33581 return DAG.getBitcast(VTy, NewLoad);
33582}
33583
33585 SDValue Chain,
33587 SDValue Val, SDValue Mask) const {
33588 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33589 // ->
33590 // _, flags = SUB 0, mask
33591 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33593 SDVTList Tys = DAG.getVTList(MVT::Other);
33594 auto ScalarVal = DAG.getBitcast(Ty, Val);
33595 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33596 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33597 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33598 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33599}
33600
33601/// Provide custom lowering hooks for some operations.
33603 switch (Op.getOpcode()) {
33604 // clang-format off
33605 default: llvm_unreachable("Should not custom lower this!");
33606 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33607 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33608 return LowerCMP_SWAP(Op, Subtarget, DAG);
33609 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33610 case ISD::ATOMIC_LOAD_ADD:
33611 case ISD::ATOMIC_LOAD_SUB:
33612 case ISD::ATOMIC_LOAD_OR:
33613 case ISD::ATOMIC_LOAD_XOR:
33614 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33615 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33616 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33617 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33618 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33619 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33620 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33621 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33622 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33623 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33624 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33625 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33626 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33627 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33628 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33629 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33630 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33631 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33632 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33633 case ISD::SHL_PARTS:
33634 case ISD::SRA_PARTS:
33635 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33636 case ISD::FSHL:
33637 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33638 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33640 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33642 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33643 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33644 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33645 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33646 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33649 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33650 case ISD::FP_TO_SINT:
33652 case ISD::FP_TO_UINT:
33653 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33655 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33656 case ISD::FP_EXTEND:
33657 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33658 case ISD::FP_ROUND:
33659 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33660 case ISD::FP16_TO_FP:
33661 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33662 case ISD::FP_TO_FP16:
33663 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33664 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33665 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33666 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33667 case ISD::FADD:
33668 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33669 case ISD::FROUND: return LowerFROUND(Op, DAG);
33670 case ISD::FABS:
33671 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33672 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33673 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33674 case ISD::LRINT:
33675 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33676 case ISD::SETCC:
33677 case ISD::STRICT_FSETCC:
33678 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33679 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33680 case ISD::SELECT: return LowerSELECT(Op, DAG);
33681 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33682 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33683 case ISD::VASTART: return LowerVASTART(Op, DAG);
33684 case ISD::VAARG: return LowerVAARG(Op, DAG);
33685 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33686 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33688 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33689 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33690 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33691 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33693 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33694 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33695 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33696 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33697 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33699 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33700 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33701 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33702 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33703 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33704 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33705 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33706 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33707 case ISD::CTLZ:
33708 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33709 case ISD::CTTZ:
33710 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33711 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33712 case ISD::MULHS:
33713 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33714 case ISD::ROTL:
33715 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33716 case ISD::SRA:
33717 case ISD::SRL:
33718 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33719 case ISD::SADDO:
33720 case ISD::UADDO:
33721 case ISD::SSUBO:
33722 case ISD::USUBO: return LowerXALUO(Op, DAG);
33723 case ISD::SMULO:
33724 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33725 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33726 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33727 case ISD::SADDO_CARRY:
33728 case ISD::SSUBO_CARRY:
33729 case ISD::UADDO_CARRY:
33730 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33731 case ISD::ADD:
33732 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33733 case ISD::UADDSAT:
33734 case ISD::SADDSAT:
33735 case ISD::USUBSAT:
33736 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33737 case ISD::SMAX:
33738 case ISD::SMIN:
33739 case ISD::UMAX:
33740 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33741 case ISD::FMINIMUM:
33742 case ISD::FMAXIMUM:
33743 case ISD::FMINIMUMNUM:
33744 case ISD::FMAXIMUMNUM:
33745 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33746 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33747 case ISD::ABDS:
33748 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33749 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33750 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33751 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33752 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33753 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33754 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33755 case ISD::GC_TRANSITION_START:
33756 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33757 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33758 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33759 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33760 // clang-format on
33761 }
33762}
33763
33764/// Replace a node with an illegal result type with a new node built out of
33765/// custom code.
33768 SelectionDAG &DAG) const {
33769 SDLoc dl(N);
33770 unsigned Opc = N->getOpcode();
33771 switch (Opc) {
33772 default:
33773#ifndef NDEBUG
33774 dbgs() << "ReplaceNodeResults: ";
33775 N->dump(&DAG);
33776#endif
33777 llvm_unreachable("Do not know how to custom type legalize this operation!");
33778 case X86ISD::CVTPH2PS: {
33779 EVT VT = N->getValueType(0);
33780 SDValue Lo, Hi;
33781 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33782 EVT LoVT, HiVT;
33783 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33784 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33785 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33786 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33787 Results.push_back(Res);
33788 return;
33789 }
33791 EVT VT = N->getValueType(0);
33792 SDValue Lo, Hi;
33793 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33794 EVT LoVT, HiVT;
33795 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33796 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33797 {N->getOperand(0), Lo});
33798 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33799 {N->getOperand(0), Hi});
33800 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33801 Lo.getValue(1), Hi.getValue(1));
33802 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33803 Results.push_back(Res);
33804 Results.push_back(Chain);
33805 return;
33806 }
33807 case X86ISD::CVTPS2PH:
33808 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33809 return;
33810 case ISD::CTPOP: {
33811 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33812 // If we have at most 32 active bits, then perform as i32 CTPOP.
33813 // TODO: Perform this in generic legalizer?
33814 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33815 unsigned LZ = Known.countMinLeadingZeros();
33816 unsigned TZ = Known.countMinTrailingZeros();
33817 if ((LZ + TZ) >= 32) {
33818 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33819 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33820 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33821 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33822 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33823 Results.push_back(Op);
33824 return;
33825 }
33826 // Use a v2i64 if possible.
33827 bool NoImplicitFloatOps =
33829 Attribute::NoImplicitFloat);
33830 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33831 SDValue Wide =
33832 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33833 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33834 // Bit count should fit in 32-bits, extract it as that and then zero
33835 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33836 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33837 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33838 DAG.getVectorIdxConstant(0, dl));
33839 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33840 Results.push_back(Wide);
33841 }
33842 return;
33843 }
33844 case ISD::MUL: {
33845 EVT VT = N->getValueType(0);
33847 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33848 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33849 // elements are needed.
33850 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33851 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33852 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33853 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33854 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33855 unsigned NumConcats = 16 / VT.getVectorNumElements();
33856 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33857 ConcatOps[0] = Res;
33858 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33859 Results.push_back(Res);
33860 return;
33861 }
33862 case ISD::SMULO:
33863 case ISD::UMULO: {
33864 EVT VT = N->getValueType(0);
33866 VT == MVT::v2i32 && "Unexpected VT!");
33867 bool IsSigned = Opc == ISD::SMULO;
33868 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33869 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33870 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33871 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33872 // Extract the high 32 bits from each result using PSHUFD.
33873 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33874 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33875 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33876 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33877 DAG.getVectorIdxConstant(0, dl));
33878
33879 // Truncate the low bits of the result. This will become PSHUFD.
33880 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33881
33882 SDValue HiCmp;
33883 if (IsSigned) {
33884 // SMULO overflows if the high bits don't match the sign of the low.
33885 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33886 } else {
33887 // UMULO overflows if the high bits are non-zero.
33888 HiCmp = DAG.getConstant(0, dl, VT);
33889 }
33890 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33891
33892 // Widen the result with by padding with undef.
33893 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33894 DAG.getUNDEF(VT));
33895 Results.push_back(Res);
33896 Results.push_back(Ovf);
33897 return;
33898 }
33899 case X86ISD::VPMADDWD: {
33900 // Legalize types for X86ISD::VPMADDWD by widening.
33901 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33902
33903 EVT VT = N->getValueType(0);
33904 EVT InVT = N->getOperand(0).getValueType();
33905 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33906 "Expected a VT that divides into 128 bits.");
33908 "Unexpected type action!");
33909 unsigned NumConcat = 128 / InVT.getSizeInBits();
33910
33911 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33912 InVT.getVectorElementType(),
33913 NumConcat * InVT.getVectorNumElements());
33914 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33916 NumConcat * VT.getVectorNumElements());
33917
33918 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33919 Ops[0] = N->getOperand(0);
33920 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33921 Ops[0] = N->getOperand(1);
33922 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33923
33924 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33925 Results.push_back(Res);
33926 return;
33927 }
33928 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33929 case X86ISD::FMINC:
33930 case X86ISD::FMIN:
33931 case X86ISD::FMAXC:
33932 case X86ISD::FMAX:
33934 case X86ISD::STRICT_FMAX: {
33935 EVT VT = N->getValueType(0);
33936 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33937 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33938 SDValue UNDEF = DAG.getUNDEF(VT);
33939 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33940 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33941 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33942 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33943 SDValue Res;
33944 if (IsStrict)
33945 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33946 {N->getOperand(0), LHS, RHS});
33947 else
33948 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33949 Results.push_back(Res);
33950 if (IsStrict)
33951 Results.push_back(Res.getValue(1));
33952 return;
33953 }
33954 case ISD::SDIV:
33955 case ISD::UDIV:
33956 case ISD::SREM:
33957 case ISD::UREM: {
33958 EVT VT = N->getValueType(0);
33959 if (VT.isVector()) {
33961 "Unexpected type action!");
33962 // If this RHS is a constant splat vector we can widen this and let
33963 // division/remainder by constant optimize it.
33964 // TODO: Can we do something for non-splat?
33965 APInt SplatVal;
33966 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33967 unsigned NumConcats = 128 / VT.getSizeInBits();
33968 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33969 Ops0[0] = N->getOperand(0);
33970 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33971 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33972 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33973 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33974 Results.push_back(Res);
33975 }
33976 return;
33977 }
33978
33979 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33980 Results.push_back(V);
33981 return;
33982 }
33983 case ISD::TRUNCATE: {
33984 MVT VT = N->getSimpleValueType(0);
33985 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33986 return;
33987
33988 // The generic legalizer will try to widen the input type to the same
33989 // number of elements as the widened result type. But this isn't always
33990 // the best thing so do some custom legalization to avoid some cases.
33991 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33992 SDValue In = N->getOperand(0);
33993 EVT InVT = In.getValueType();
33994 EVT InEltVT = InVT.getVectorElementType();
33995 EVT EltVT = VT.getVectorElementType();
33996 unsigned MinElts = VT.getVectorNumElements();
33997 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33998 unsigned InBits = InVT.getSizeInBits();
33999
34000 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
34001 unsigned PackOpcode;
34002 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
34003 Subtarget, N->getFlags())) {
34004 if (SDValue Res =
34005 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
34006 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
34007 Results.push_back(Res);
34008 return;
34009 }
34010 }
34011
34012 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34013 // 128 bit and smaller inputs should avoid truncate all together and
34014 // use a shuffle.
34015 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34016 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34017 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34018 for (unsigned I = 0; I < MinElts; ++I)
34019 TruncMask[I] = Scale * I;
34020 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34021 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34022 "Illegal vector type in truncation");
34023 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34024 Results.push_back(
34025 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34026 return;
34027 }
34028 }
34029
34030 // With AVX512 there are some cases that can use a target specific
34031 // truncate node to go from 256/512 to less than 128 with zeros in the
34032 // upper elements of the 128 bit result.
34033 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34034 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
34035 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34036 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34037 return;
34038 }
34039 // There's one case we can widen to 512 bits and use VTRUNC.
34040 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34041 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34042 DAG.getUNDEF(MVT::v4i64));
34043 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34044 return;
34045 }
34046 }
34047 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34048 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34049 isTypeLegal(MVT::v4i64)) {
34050 // Input needs to be split and output needs to widened. Let's use two
34051 // VTRUNCs, and shuffle their results together into the wider type.
34052 SDValue Lo, Hi;
34053 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34054
34055 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34056 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34057 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34058 { 0, 1, 2, 3, 16, 17, 18, 19,
34059 -1, -1, -1, -1, -1, -1, -1, -1 });
34060 Results.push_back(Res);
34061 return;
34062 }
34063
34064 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34065 // this via type legalization.
34066 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34067 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34068 (!Subtarget.hasSSSE3() ||
34069 (!isTypeLegal(InVT) &&
34070 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34071 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34072 InEltVT.getSizeInBits() * WidenNumElts);
34073 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34074 return;
34075 }
34076
34077 return;
34078 }
34079 case ISD::ANY_EXTEND:
34080 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34081 // It's intended to custom handle the input type.
34082 assert(N->getValueType(0) == MVT::v8i8 &&
34083 "Do not know how to legalize this Node");
34084 return;
34085 case ISD::SIGN_EXTEND:
34086 case ISD::ZERO_EXTEND: {
34087 EVT VT = N->getValueType(0);
34088 SDValue In = N->getOperand(0);
34089 EVT InVT = In.getValueType();
34090 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34091 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34093 "Unexpected type action!");
34094 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34095 // Custom split this so we can extend i8/i16->i32 invec. This is better
34096 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34097 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34098 // we allow the sra from the extend to i32 to be shared by the split.
34099 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34100
34101 // Fill a vector with sign bits for each element.
34102 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34103 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34104
34105 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34106 // to v2i64.
34107 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34108 {0, 4, 1, 5});
34109 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34110 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34111 {2, 6, 3, 7});
34112 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34113
34114 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34115 Results.push_back(Res);
34116 return;
34117 }
34118
34119 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34120 if (!InVT.is128BitVector()) {
34121 // Not a 128 bit vector, but maybe type legalization will promote
34122 // it to 128 bits.
34123 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34124 return;
34125 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34126 if (!InVT.is128BitVector())
34127 return;
34128
34129 // Promote the input to 128 bits. Type legalization will turn this into
34130 // zext_inreg/sext_inreg.
34131 In = DAG.getNode(Opc, dl, InVT, In);
34132 }
34133
34134 // Perform custom splitting instead of the two stage extend we would get
34135 // by default.
34136 EVT LoVT, HiVT;
34137 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34138 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34139
34140 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34141
34142 // We need to shift the input over by half the number of elements.
34143 unsigned NumElts = InVT.getVectorNumElements();
34144 unsigned HalfNumElts = NumElts / 2;
34145 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34146 for (unsigned i = 0; i != HalfNumElts; ++i)
34147 ShufMask[i] = i + HalfNumElts;
34148
34149 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34150 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34151
34152 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34153 Results.push_back(Res);
34154 }
34155 return;
34156 }
34158 case ISD::FP_TO_UINT_SAT: {
34159 if (!Subtarget.hasAVX10_2())
34160 return;
34161
34162 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34163 EVT VT = N->getValueType(0);
34164 SDValue Op = N->getOperand(0);
34165 EVT OpVT = Op.getValueType();
34166 SDValue Res;
34167
34168 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34169 if (IsSigned)
34170 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34171 else
34172 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34173 Results.push_back(Res);
34174 }
34175 return;
34176 }
34177 case ISD::FP_TO_SINT:
34179 case ISD::FP_TO_UINT:
34181 bool IsStrict = N->isStrictFPOpcode();
34182 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34183 EVT VT = N->getValueType(0);
34184 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34185 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34186 EVT SrcVT = Src.getValueType();
34187
34188 SDValue Res;
34189 if (isSoftF16(SrcVT, Subtarget)) {
34190 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34191 if (IsStrict) {
34192 Res =
34193 DAG.getNode(Opc, dl, {VT, MVT::Other},
34194 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34195 {NVT, MVT::Other}, {Chain, Src})});
34196 Chain = Res.getValue(1);
34197 } else {
34198 Res =
34199 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34200 }
34201 Results.push_back(Res);
34202 if (IsStrict)
34203 Results.push_back(Chain);
34204
34205 return;
34206 }
34207
34208 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34209 SrcVT.getVectorElementType() == MVT::f16) {
34210 EVT EleVT = VT.getVectorElementType();
34211 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34212
34213 if (SrcVT != MVT::v8f16) {
34214 SDValue Tmp =
34215 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34216 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34217 Ops[0] = Src;
34218 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34219 }
34220
34221       if (IsStrict) {
34222         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34223         Res =
34224 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34225 Chain = Res.getValue(1);
34226 } else {
34227 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34228 Res = DAG.getNode(Opc, dl, ResVT, Src);
34229 }
34230
34231 // TODO: Need to add exception check code for strict FP.
34232 if (EleVT.getSizeInBits() < 16) {
34233 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34234 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34235
34236 // Now widen to 128 bits.
34237 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34238 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34239 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34240 ConcatOps[0] = Res;
34241 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34242 }
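      // For example, an i8 element type: the v8i16 conversion result is
      // truncated to v8i8 (TmpVT) and padded with a second, undef v8i8 half
      // so the value pushed into Results below has a legal 128-bit type.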
34243
34244 Results.push_back(Res);
34245 if (IsStrict)
34246 Results.push_back(Chain);
34247
34248 return;
34249 }
34250
34251 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34252       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34253              "Unexpected type action!");
34254
34255 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34256 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34257 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34258                                        VT.getVectorNumElements());
34259       SDValue Res;
34260 SDValue Chain;
34261 if (IsStrict) {
34262 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34263 {N->getOperand(0), Src});
34264 Chain = Res.getValue(1);
34265 } else
34266 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34267
34268 // Preserve what we know about the size of the original result. If the
34269 // result is v2i32, we have to manually widen the assert.
34270 if (PromoteVT == MVT::v2i32)
34271 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34272 DAG.getUNDEF(MVT::v2i32));
34273
34274 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34275 Res.getValueType(), Res,
34276                         DAG.getValueType(VT.getVectorElementType()));
34277
34278 if (PromoteVT == MVT::v2i32)
34279 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34280 DAG.getVectorIdxConstant(0, dl));
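      // The AssertZext/AssertSext emitted above records that only the bits of
      // the original element type are meaningful, which lets later combines
      // fold away redundant zero/sign-extensions of the truncated result.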
34281
34282 // Truncate back to the original width.
34283 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34284
34285 // Now widen to 128 bits.
34286 unsigned NumConcats = 128 / VT.getSizeInBits();
34287       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34288                                       VT.getVectorNumElements() * NumConcats);
34289 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34290 ConcatOps[0] = Res;
34291 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34292 Results.push_back(Res);
34293 if (IsStrict)
34294 Results.push_back(Chain);
34295 return;
34296 }
34297
34298
34299 if (VT == MVT::v2i32) {
34300 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34301 "Strict unsigned conversion requires AVX512");
34302 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34303       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34304              "Unexpected type action!");
34305 if (Src.getValueType() == MVT::v2f64) {
34306 if (!IsSigned && !Subtarget.hasAVX512()) {
34307 SDValue Res =
34308 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34309 Results.push_back(Res);
34310 return;
34311 }
34312
34313 if (IsStrict)
34314           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34315         else
34316 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34317
34318         // If we have VLX we can emit a target specific FP_TO_UINT node.
34319 if (!IsSigned && !Subtarget.hasVLX()) {
34320 // Otherwise we can defer to the generic legalizer which will widen
34321 // the input as well. This will be further widened during op
34322 // legalization to v8i32<-v8f64.
34323 // For strict nodes we'll need to widen ourselves.
34324 // FIXME: Fix the type legalizer to safely widen strict nodes?
34325 if (!IsStrict)
34326 return;
34327 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34328 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34329 Opc = N->getOpcode();
34330 }
34331 SDValue Res;
34332 SDValue Chain;
34333 if (IsStrict) {
34334 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34335 {N->getOperand(0), Src});
34336 Chain = Res.getValue(1);
34337 } else {
34338 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34339 }
34340 Results.push_back(Res);
34341 if (IsStrict)
34342 Results.push_back(Chain);
34343 return;
34344 }
34345
34346 // Custom widen strict v2f32->v2i32 by padding with zeros.
34347 // FIXME: Should generic type legalizer do this?
34348 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34349 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34350 DAG.getConstantFP(0.0, dl, MVT::v2f32));
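      // Zero (rather than undef) is used for the extra lanes so the widened
      // strict node cannot raise spurious FP exceptions on the padding lanes.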
34351 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34352 {N->getOperand(0), Src});
34353 Results.push_back(Res);
34354 Results.push_back(Res.getValue(1));
34355 return;
34356 }
34357
34358 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34359 // so early out here.
34360 return;
34361 }
34362
34363 assert(!VT.isVector() && "Vectors should have been handled above!");
34364
34365 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34366 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34367 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34368 assert(!Subtarget.is64Bit() && "i64 should be legal");
34369 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34370 // If we use a 128-bit result we might need to use a target specific node.
34371 unsigned SrcElts =
34372 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34373 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34374 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34375 if (NumElts != SrcElts) {
34376 if (IsStrict)
34377         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34378       else
34379 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34380 }
34381
34382 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34383 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34384 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34385 ZeroIdx);
34386 SDValue Chain;
34387 if (IsStrict) {
34388 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34389 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34390 Chain = Res.getValue(1);
34391 } else
34392 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34393 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34394 Results.push_back(Res);
34395 if (IsStrict)
34396 Results.push_back(Chain);
34397 return;
34398 }
34399
34400 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34401 SDValue Chain;
34402 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34403 Results.push_back(V);
34404 if (IsStrict)
34405 Results.push_back(Chain);
34406 return;
34407 }
34408
34409 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34410 Results.push_back(V);
34411 if (IsStrict)
34412 Results.push_back(Chain);
34413 }
34414 return;
34415 }
34416 case ISD::LRINT:
34417 if (N->getValueType(0) == MVT::v2i32) {
34418 SDValue Src = N->getOperand(0);
34419 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34420 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34421 DAG.getUNDEF(MVT::v2f16));
34422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34423 DAG.getUNDEF(MVT::v4f16));
34424 } else if (Src.getValueType() != MVT::v2f64) {
34425 return;
34426 }
34427 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34428 return;
34429 }
34430 [[fallthrough]];
34431 case ISD::LLRINT: {
34432 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34433 Results.push_back(V);
34434 return;
34435 }
34436
34437   case ISD::SINT_TO_FP:
34438   case ISD::STRICT_SINT_TO_FP:
34439   case ISD::UINT_TO_FP:
34440   case ISD::STRICT_UINT_TO_FP: {
34441 bool IsStrict = N->isStrictFPOpcode();
34442 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34443 EVT VT = N->getValueType(0);
34444 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34445 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34446 Subtarget.hasVLX()) {
34447 if (Src.getValueType().getVectorElementType() == MVT::i16)
34448 return;
34449
34450 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34451 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34452 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34453 : DAG.getUNDEF(MVT::v2i32));
34454 if (IsStrict) {
34455 unsigned Opc =
34456             IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34457         SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34458 {N->getOperand(0), Src});
34459 Results.push_back(Res);
34460 Results.push_back(Res.getValue(1));
34461 } else {
34462 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34463 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34464 }
34465 return;
34466 }
34467 if (VT != MVT::v2f32)
34468 return;
34469 EVT SrcVT = Src.getValueType();
34470 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34471 if (IsStrict) {
34472 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34473                                   : X86ISD::STRICT_CVTUI2P;
34474         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34475 {N->getOperand(0), Src});
34476 Results.push_back(Res);
34477 Results.push_back(Res.getValue(1));
34478 } else {
34479 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34480 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34481 }
34482 return;
34483 }
34484 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34485 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34486 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34487 SDValue One = DAG.getConstant(1, dl, SrcVT);
34488 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34489 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34490 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34491 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34492 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34493 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34494 for (int i = 0; i != 2; ++i) {
34495 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34496 SignSrc, DAG.getVectorIdxConstant(i, dl));
34497 if (IsStrict)
34498 SignCvts[i] =
34499 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34500 {N->getOperand(0), Elt});
34501 else
34502 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34503 };
34504 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34505 SDValue Slow, Chain;
34506 if (IsStrict) {
34507 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34508 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34509 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34510 {Chain, SignCvt, SignCvt});
34511 Chain = Slow.getValue(1);
34512 } else {
34513 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34514 }
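      // Rationale of the sequence above: for a lane with the sign bit set,
      // (x >> 1) | (x & 1) halves the value while keeping the sticky rounding
      // bit, so converting it as signed and adding the result to itself
      // recovers the original unsigned value to within f32 rounding; lanes
      // below 2^63 simply take the direct signed conversion.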
34515 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34516 IsNeg =
34517 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34518 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34519 Results.push_back(Cvt);
34520 if (IsStrict)
34521 Results.push_back(Chain);
34522 return;
34523 }
34524
34525 if (SrcVT != MVT::v2i32)
34526 return;
34527
34528 if (IsSigned || Subtarget.hasAVX512()) {
34529 if (!IsStrict)
34530 return;
34531
34532 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34533 // FIXME: Should generic type legalizer do this?
34534 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34535 DAG.getConstant(0, dl, MVT::v2i32));
34536 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34537 {N->getOperand(0), Src});
34538 Results.push_back(Res);
34539 Results.push_back(Res.getValue(1));
34540 return;
34541 }
34542
34543 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34544 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34545 SDValue VBias = DAG.getConstantFP(
34546 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34547 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34548 DAG.getBitcast(MVT::v2i64, VBias));
34549 Or = DAG.getBitcast(MVT::v2f64, Or);
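      // 0x4330000000000000 is the double 2^52. OR'ing the zero-extended i32
      // lanes into its mantissa yields exactly 2^52 + x, so subtracting the
      // bias below gives an exact unsigned i32 -> f64 conversion, which is
      // then narrowed to f32 with VFPROUND.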
34550 if (IsStrict) {
34551 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34552 {N->getOperand(0), Or, VBias});
34553       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34554                                 {MVT::v4f32, MVT::Other},
34555 {Sub.getValue(1), Sub});
34556 Results.push_back(Res);
34557 Results.push_back(Res.getValue(1));
34558 } else {
34559 // TODO: Are there any fast-math-flags to propagate here?
34560 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34561 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34562 }
34563 return;
34564 }
34565   case ISD::STRICT_FP_ROUND:
34566   case ISD::FP_ROUND: {
34567 bool IsStrict = N->isStrictFPOpcode();
34568 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34569 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34570 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34571 EVT SrcVT = Src.getValueType();
34572 EVT VT = N->getValueType(0);
34573 SDValue V;
34574 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34575 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34576 : DAG.getUNDEF(MVT::v2f32);
34577 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34578 }
34579 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34580 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34581 if (SrcVT.getVectorElementType() != MVT::f32)
34582 return;
34583
34584 if (IsStrict)
34585 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34586 {Chain, Src, Rnd});
34587 else
34588 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34589
34590 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34591 if (IsStrict)
34592 Results.push_back(V.getValue(1));
34593 return;
34594 }
34595 if (!isTypeLegal(Src.getValueType()))
34596 return;
34597 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34598 if (IsStrict)
34599 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34600 {Chain, Src});
34601 else
34602 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34603 Results.push_back(V);
34604 if (IsStrict)
34605 Results.push_back(V.getValue(1));
34606 return;
34607 }
34608 case ISD::FP_EXTEND:
34609 case ISD::STRICT_FP_EXTEND: {
34610 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34611 // No other ValueType for FP_EXTEND should reach this point.
34612 assert(N->getValueType(0) == MVT::v2f32 &&
34613 "Do not know how to legalize this Node");
34614 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34615 return;
34616 bool IsStrict = N->isStrictFPOpcode();
34617 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34618 if (Src.getValueType().getVectorElementType() != MVT::f16)
34619 return;
34620 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34621 : DAG.getUNDEF(MVT::v2f16);
34622 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34623 if (IsStrict)
34624 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34625 {N->getOperand(0), V});
34626 else
34627 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34628 Results.push_back(V);
34629 if (IsStrict)
34630 Results.push_back(V.getValue(1));
34631 return;
34632 }
34632
34633   case ISD::INTRINSIC_W_CHAIN: {
34634     unsigned IntNo = N->getConstantOperandVal(1);
34635 switch (IntNo) {
34636 default : llvm_unreachable("Do not know how to custom type "
34637 "legalize this intrinsic operation!");
34638 case Intrinsic::x86_rdtsc:
34639 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34640 Results);
34641 case Intrinsic::x86_rdtscp:
34642 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34643 Results);
34644 case Intrinsic::x86_rdpmc:
34645 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34646 Results);
34647 return;
34648 case Intrinsic::x86_rdpru:
34649 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34650 Results);
34651 return;
34652 case Intrinsic::x86_xgetbv:
34653 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34654 Results);
34655 return;
34656 }
34657 }
34658 case ISD::READCYCLECOUNTER: {
34659 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34660 }
34661 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34662 EVT T = N->getValueType(0);
34663 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34664 bool Regs64bit = T == MVT::i128;
34665 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34666 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34667 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34668 SDValue cpInL, cpInH;
34669 std::tie(cpInL, cpInH) =
34670 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34671 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34672 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34673 cpInH =
34674 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34675 cpInH, cpInL.getValue(1));
34676 SDValue swapInL, swapInH;
34677 std::tie(swapInL, swapInH) =
34678 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34679 swapInH =
34680 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34681 swapInH, cpInH.getValue(1));
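    // CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX) and
    // the replacement value in ECX:EBX (RCX:RBX); the previous memory value
    // comes back in EDX:EAX and ZF reports whether the swap succeeded, which
    // is what the EFLAGS copy and SETCC below recover.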
34682
34683 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34684 // until later. So we keep the RBX input in a vreg and use a custom
34685 // inserter.
34686     // Since RBX will be a reserved register, the register allocator will not
34687     // make sure its value is properly saved and restored around this
34688     // live-range.
34689 SDValue Result;
34690 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34691 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34692 if (Regs64bit) {
34693 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34694 swapInH.getValue(1)};
34695 Result =
34696 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34697 } else {
34698 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34699 swapInH.getValue(1));
34700 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34701 swapInL.getValue(1)};
34702 Result =
34703 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34704 }
34705
34706 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34707 Regs64bit ? X86::RAX : X86::EAX,
34708 HalfT, Result.getValue(1));
34709 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34710 Regs64bit ? X86::RDX : X86::EDX,
34711 HalfT, cpOutL.getValue(2));
34712 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34713
34714 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34715 MVT::i32, cpOutH.getValue(2));
34716 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34717 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34718
34719 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34720 Results.push_back(Success);
34721 Results.push_back(EFLAGS.getValue(1));
34722 return;
34723 }
34724 case ISD::ATOMIC_LOAD: {
34725 assert(
34726 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34727 "Unexpected VT!");
34728 bool NoImplicitFloatOps =
34729         DAG.getMachineFunction().getFunction().hasFnAttribute(
34730             Attribute::NoImplicitFloat);
34731 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34732 auto *Node = cast<AtomicSDNode>(N);
34733
34734 if (N->getValueType(0) == MVT::i128) {
34735 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34736 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34737 Node->getBasePtr(), Node->getMemOperand());
34738 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34739 DAG.getVectorIdxConstant(0, dl));
34740 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34741 DAG.getVectorIdxConstant(1, dl));
34742 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34743 {ResL, ResH}));
34744 Results.push_back(Ld.getValue(1));
34745 return;
34746 }
34747 break;
34748 }
34749 if (Subtarget.hasSSE1()) {
34750 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34751 // Then extract the lower 64-bits.
34752 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34753 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34754 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34755         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34756                                              MVT::i64, Node->getMemOperand());
34757 if (Subtarget.hasSSE2()) {
34758 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34759 DAG.getVectorIdxConstant(0, dl));
34760 Results.push_back(Res);
34761 Results.push_back(Ld.getValue(1));
34762 return;
34763 }
34764 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34765 // then casts to i64. This avoids a 128-bit stack temporary being
34766 // created by type legalization if we were to cast v4f32->v2i64.
34767 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34768 DAG.getVectorIdxConstant(0, dl));
34769 Res = DAG.getBitcast(MVT::i64, Res);
34770 Results.push_back(Res);
34771 Results.push_back(Ld.getValue(1));
34772 return;
34773 }
34774 if (Subtarget.hasX87()) {
34775 // First load this into an 80-bit X87 register. This will put the whole
34776 // integer into the significand.
34777 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34778 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34779         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34780                                                  dl, Tys, Ops, MVT::i64,
34781 Node->getMemOperand());
34782 SDValue Chain = Result.getValue(1);
34783
34784 // Now store the X87 register to a stack temporary and convert to i64.
34785 // This store is not atomic and doesn't need to be.
34786 // FIXME: We don't need a stack temporary if the result of the load
34787 // is already being stored. We could just directly store there.
34788 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34789 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34790 MachinePointerInfo MPI =
34791             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34792         SDValue StoreOps[] = { Chain, Result, StackPtr };
34793 Chain = DAG.getMemIntrinsicNode(
34794 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34795 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
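        // The f80 significand is 64 bits wide, so the FILD/FIST round trip
        // through the x87 stack reproduces the i64 value exactly.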
34796
34797 // Finally load the value back from the stack temporary and return it.
34798 // This load is not atomic and doesn't need to be.
34799 // This load will be further type legalized.
34800 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34801 Results.push_back(Result);
34802 Results.push_back(Result.getValue(1));
34803 return;
34804 }
34805 }
34806 // TODO: Use MOVLPS when SSE1 is available?
34807 // Delegate to generic TypeLegalization. Situations we can really handle
34808 // should have already been dealt with by AtomicExpandPass.cpp.
34809 break;
34810 }
34811 case ISD::ATOMIC_SWAP:
34812 case ISD::ATOMIC_LOAD_ADD:
34813 case ISD::ATOMIC_LOAD_SUB:
34814 case ISD::ATOMIC_LOAD_AND:
34815 case ISD::ATOMIC_LOAD_OR:
34816 case ISD::ATOMIC_LOAD_XOR:
34817 case ISD::ATOMIC_LOAD_NAND:
34818 case ISD::ATOMIC_LOAD_MIN:
34819 case ISD::ATOMIC_LOAD_MAX:
34820 case ISD::ATOMIC_LOAD_UMIN:
34821 case ISD::ATOMIC_LOAD_UMAX:
34822 // Delegate to generic TypeLegalization. Situations we can really handle
34823 // should have already been dealt with by AtomicExpandPass.cpp.
34824 break;
34825
34826 case ISD::BITCAST: {
34827 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34828 EVT DstVT = N->getValueType(0);
34829 EVT SrcVT = N->getOperand(0).getValueType();
34830
34831 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34832 // we can split using the k-register rather than memory.
34833 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34834 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34835 SDValue Lo, Hi;
34836 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34837 Lo = DAG.getBitcast(MVT::i32, Lo);
34838 Hi = DAG.getBitcast(MVT::i32, Hi);
34839 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34840 Results.push_back(Res);
34841 return;
34842 }
34843
34844 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34845 // FIXME: Use v4f32 for SSE1?
34846 assert(Subtarget.hasSSE2() && "Requires SSE2");
34847 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34848 "Unexpected type action!");
34849 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34850 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34851 N->getOperand(0));
34852 Res = DAG.getBitcast(WideVT, Res);
34853 Results.push_back(Res);
34854 return;
34855 }
34856
34857 return;
34858 }
34859 case ISD::MGATHER: {
34860 EVT VT = N->getValueType(0);
34861 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34862 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34863 auto *Gather = cast<MaskedGatherSDNode>(N);
34864 SDValue Index = Gather->getIndex();
34865 if (Index.getValueType() != MVT::v2i64)
34866 return;
34867       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34868              "Unexpected type action!");
34869 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34870 SDValue Mask = Gather->getMask();
34871 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34872 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34873 Gather->getPassThru(),
34874 DAG.getUNDEF(VT));
34875 if (!Subtarget.hasVLX()) {
34876 // We need to widen the mask, but the instruction will only use 2
34877 // of its elements. So we can use undef.
34878 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34879 DAG.getUNDEF(MVT::v2i1));
34880 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34881 }
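      // Without VLX this lowers to an AVX2-style gather, which predicates
      // each lane on the sign bit of the corresponding mask element, hence
      // the widening and sign-extension of the v2i1 mask above.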
34882 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34883 Gather->getBasePtr(), Index, Gather->getScale() };
34884 SDValue Res = DAG.getMemIntrinsicNode(
34885 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34886 Gather->getMemoryVT(), Gather->getMemOperand());
34887 Results.push_back(Res);
34888 Results.push_back(Res.getValue(1));
34889 return;
34890 }
34891 return;
34892 }
34893 case ISD::LOAD: {
34894 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34895     // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34896 // cast since type legalization will try to use an i64 load.
34897 MVT VT = N->getSimpleValueType(0);
34898 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34899     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34900            "Unexpected type action!");
34901 if (!ISD::isNON_EXTLoad(N))
34902 return;
34903 auto *Ld = cast<LoadSDNode>(N);
34904 if (Subtarget.hasSSE2()) {
34905 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34906 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34907 Ld->getPointerInfo(), Ld->getBaseAlign(),
34908 Ld->getMemOperand()->getFlags());
34909 SDValue Chain = Res.getValue(1);
34910 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34911 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34912 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34913 Res = DAG.getBitcast(WideVT, Res);
34914 Results.push_back(Res);
34915 Results.push_back(Chain);
34916 return;
34917 }
34918 assert(Subtarget.hasSSE1() && "Expected SSE");
34919 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34920 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34921     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34922                                           MVT::i64, Ld->getMemOperand());
34923 Results.push_back(Res);
34924 Results.push_back(Res.getValue(1));
34925 return;
34926 }
34927 case ISD::ADDRSPACECAST: {
34928 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34929 Results.push_back(V);
34930 return;
34931 }
34932 case ISD::BITREVERSE: {
34933 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34934 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34935 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34936 // We'll need to move the scalar in two i32 pieces.
34937 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34938 return;
34939 }
34940   case ISD::EXTRACT_VECTOR_ELT: {
34941     // f16 = extract vXf16 %vec, i64 %idx
34942 assert(N->getSimpleValueType(0) == MVT::f16 &&
34943 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34944 assert(Subtarget.hasFP16() && "Expected FP16");
34945 SDValue VecOp = N->getOperand(0);
34946     EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34947     SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34948 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34949 N->getOperand(1));
34950 Split = DAG.getBitcast(MVT::f16, Split);
34951 Results.push_back(Split);
34952 return;
34953 }
34954 }
34955}
34956
34957const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34958 switch ((X86ISD::NodeType)Opcode) {
34959 case X86ISD::FIRST_NUMBER: break;
34960#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34961 NODE_NAME_CASE(BSF)
34962 NODE_NAME_CASE(BSR)
34963 NODE_NAME_CASE(FSHL)
34964 NODE_NAME_CASE(FSHR)
34965 NODE_NAME_CASE(FAND)
34966 NODE_NAME_CASE(FANDN)
34967 NODE_NAME_CASE(FOR)
34968 NODE_NAME_CASE(FXOR)
34969 NODE_NAME_CASE(FILD)
34970 NODE_NAME_CASE(FIST)
34971 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34972 NODE_NAME_CASE(FLD)
34973 NODE_NAME_CASE(FST)
34974 NODE_NAME_CASE(CALL)
34975 NODE_NAME_CASE(CALL_RVMARKER)
34976 NODE_NAME_CASE(IMP_CALL)
34978 NODE_NAME_CASE(CMP)
34979 NODE_NAME_CASE(FCMP)
34980 NODE_NAME_CASE(STRICT_FCMP)
34981 NODE_NAME_CASE(STRICT_FCMPS)
34982   NODE_NAME_CASE(COMI)
34983   NODE_NAME_CASE(UCOMI)
34984 NODE_NAME_CASE(COMX)
34985 NODE_NAME_CASE(UCOMX)
34986 NODE_NAME_CASE(CMPM)
34987 NODE_NAME_CASE(CMPMM)
34988 NODE_NAME_CASE(STRICT_CMPM)
34989 NODE_NAME_CASE(CMPMM_SAE)
34990 NODE_NAME_CASE(SETCC)
34991 NODE_NAME_CASE(SETCC_CARRY)
34992 NODE_NAME_CASE(FSETCC)
34993 NODE_NAME_CASE(FSETCCM)
34994 NODE_NAME_CASE(FSETCCM_SAE)
34995 NODE_NAME_CASE(CMOV)
34996 NODE_NAME_CASE(BRCOND)
34997 NODE_NAME_CASE(RET_GLUE)
34998 NODE_NAME_CASE(IRET)
34999 NODE_NAME_CASE(REP_STOS)
35000 NODE_NAME_CASE(REP_MOVS)
35001 NODE_NAME_CASE(GlobalBaseReg)
35002   NODE_NAME_CASE(Wrapper)
35003   NODE_NAME_CASE(WrapperRIP)
35004 NODE_NAME_CASE(MOVQ2DQ)
35005 NODE_NAME_CASE(MOVDQ2Q)
35006 NODE_NAME_CASE(MMX_MOVD2W)
35007 NODE_NAME_CASE(MMX_MOVW2D)
35008 NODE_NAME_CASE(PEXTRB)
35009 NODE_NAME_CASE(PEXTRW)
35010 NODE_NAME_CASE(INSERTPS)
35011 NODE_NAME_CASE(PINSRB)
35012 NODE_NAME_CASE(PINSRW)
35013 NODE_NAME_CASE(PSHUFB)
35014 NODE_NAME_CASE(ANDNP)
35015 NODE_NAME_CASE(BLENDI)
35017 NODE_NAME_CASE(HADD)
35018 NODE_NAME_CASE(HSUB)
35019 NODE_NAME_CASE(FHADD)
35020 NODE_NAME_CASE(FHSUB)
35021 NODE_NAME_CASE(CONFLICT)
35022 NODE_NAME_CASE(FMAX)
35023 NODE_NAME_CASE(FMAXS)
35024 NODE_NAME_CASE(FMAX_SAE)
35025 NODE_NAME_CASE(FMAXS_SAE)
35026 NODE_NAME_CASE(STRICT_FMAX)
35027 NODE_NAME_CASE(FMIN)
35028 NODE_NAME_CASE(FMINS)
35029 NODE_NAME_CASE(FMIN_SAE)
35030 NODE_NAME_CASE(FMINS_SAE)
35031 NODE_NAME_CASE(STRICT_FMIN)
35032 NODE_NAME_CASE(FMAXC)
35033 NODE_NAME_CASE(FMINC)
35034 NODE_NAME_CASE(FRSQRT)
35035 NODE_NAME_CASE(FRCP)
35036 NODE_NAME_CASE(EXTRQI)
35037 NODE_NAME_CASE(INSERTQI)
35038 NODE_NAME_CASE(TLSADDR)
35039 NODE_NAME_CASE(TLSBASEADDR)
35040 NODE_NAME_CASE(TLSCALL)
35041 NODE_NAME_CASE(TLSDESC)
35042 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35043 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35044 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35045 NODE_NAME_CASE(EH_RETURN)
35046 NODE_NAME_CASE(TC_RETURN)
35047 NODE_NAME_CASE(FNSTCW16m)
35048 NODE_NAME_CASE(FLDCW16m)
35049 NODE_NAME_CASE(FNSTENVm)
35050 NODE_NAME_CASE(FLDENVm)
35051 NODE_NAME_CASE(LCMPXCHG_DAG)
35052 NODE_NAME_CASE(LCMPXCHG8_DAG)
35053 NODE_NAME_CASE(LCMPXCHG16_DAG)
35054 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35055 NODE_NAME_CASE(LADD)
35056 NODE_NAME_CASE(LSUB)
35057 NODE_NAME_CASE(LOR)
35058 NODE_NAME_CASE(LXOR)
35059 NODE_NAME_CASE(LAND)
35060 NODE_NAME_CASE(LBTS)
35061 NODE_NAME_CASE(LBTC)
35062 NODE_NAME_CASE(LBTR)
35063 NODE_NAME_CASE(LBTS_RM)
35064 NODE_NAME_CASE(LBTC_RM)
35065 NODE_NAME_CASE(LBTR_RM)
35066 NODE_NAME_CASE(AADD)
35067 NODE_NAME_CASE(AOR)
35068 NODE_NAME_CASE(AXOR)
35069 NODE_NAME_CASE(AAND)
35070 NODE_NAME_CASE(VZEXT_MOVL)
35071 NODE_NAME_CASE(VZEXT_LOAD)
35072 NODE_NAME_CASE(VEXTRACT_STORE)
35073 NODE_NAME_CASE(VTRUNC)
35074 NODE_NAME_CASE(VTRUNCS)
35075 NODE_NAME_CASE(VTRUNCUS)
35076 NODE_NAME_CASE(VMTRUNC)
35077 NODE_NAME_CASE(VMTRUNCS)
35078 NODE_NAME_CASE(VMTRUNCUS)
35079 NODE_NAME_CASE(VTRUNCSTORES)
35080 NODE_NAME_CASE(VTRUNCSTOREUS)
35081 NODE_NAME_CASE(VMTRUNCSTORES)
35082 NODE_NAME_CASE(VMTRUNCSTOREUS)
35083 NODE_NAME_CASE(VFPEXT)
35084 NODE_NAME_CASE(STRICT_VFPEXT)
35085 NODE_NAME_CASE(VFPEXT_SAE)
35086 NODE_NAME_CASE(VFPEXTS)
35087 NODE_NAME_CASE(VFPEXTS_SAE)
35088 NODE_NAME_CASE(VFPROUND)
35089 NODE_NAME_CASE(VFPROUND2)
35090 NODE_NAME_CASE(VFPROUND2_RND)
35091 NODE_NAME_CASE(STRICT_VFPROUND)
35092 NODE_NAME_CASE(VMFPROUND)
35093 NODE_NAME_CASE(VFPROUND_RND)
35094 NODE_NAME_CASE(VFPROUNDS)
35095 NODE_NAME_CASE(VFPROUNDS_RND)
35096 NODE_NAME_CASE(VSHLDQ)
35097 NODE_NAME_CASE(VSRLDQ)
35098 NODE_NAME_CASE(VSHL)
35099 NODE_NAME_CASE(VSRL)
35100 NODE_NAME_CASE(VSRA)
35101 NODE_NAME_CASE(VSHLI)
35102 NODE_NAME_CASE(VSRLI)
35103 NODE_NAME_CASE(VSRAI)
35104 NODE_NAME_CASE(VSHLV)
35105 NODE_NAME_CASE(VSRLV)
35106 NODE_NAME_CASE(VSRAV)
35107 NODE_NAME_CASE(VROTLI)
35108 NODE_NAME_CASE(VROTRI)
35109 NODE_NAME_CASE(VPPERM)
35110 NODE_NAME_CASE(CMPP)
35111 NODE_NAME_CASE(STRICT_CMPP)
35112 NODE_NAME_CASE(PCMPEQ)
35113 NODE_NAME_CASE(PCMPGT)
35114 NODE_NAME_CASE(PHMINPOS)
35115 NODE_NAME_CASE(ADD)
35116 NODE_NAME_CASE(SUB)
35117 NODE_NAME_CASE(ADC)
35118 NODE_NAME_CASE(SBB)
35119 NODE_NAME_CASE(SMUL)
35120 NODE_NAME_CASE(UMUL)
35121 NODE_NAME_CASE(OR)
35122 NODE_NAME_CASE(XOR)
35123 NODE_NAME_CASE(AND)
35124 NODE_NAME_CASE(BEXTR)
35126 NODE_NAME_CASE(BZHI)
35127 NODE_NAME_CASE(PDEP)
35128 NODE_NAME_CASE(PEXT)
35129 NODE_NAME_CASE(MUL_IMM)
35130 NODE_NAME_CASE(MOVMSK)
35131 NODE_NAME_CASE(PTEST)
35132 NODE_NAME_CASE(TESTP)
35133 NODE_NAME_CASE(KORTEST)
35134 NODE_NAME_CASE(KTEST)
35135 NODE_NAME_CASE(KADD)
35136 NODE_NAME_CASE(KSHIFTL)
35137 NODE_NAME_CASE(KSHIFTR)
35138 NODE_NAME_CASE(PACKSS)
35139 NODE_NAME_CASE(PACKUS)
35140 NODE_NAME_CASE(PALIGNR)
35141 NODE_NAME_CASE(VALIGN)
35142 NODE_NAME_CASE(VSHLD)
35143 NODE_NAME_CASE(VSHRD)
35144 NODE_NAME_CASE(VSHLDV)
35145 NODE_NAME_CASE(VSHRDV)
35146 NODE_NAME_CASE(PSHUFD)
35147 NODE_NAME_CASE(PSHUFHW)
35148 NODE_NAME_CASE(PSHUFLW)
35149 NODE_NAME_CASE(SHUFP)
35150 NODE_NAME_CASE(SHUF128)
35151 NODE_NAME_CASE(MOVLHPS)
35152 NODE_NAME_CASE(MOVHLPS)
35153 NODE_NAME_CASE(MOVDDUP)
35154 NODE_NAME_CASE(MOVSHDUP)
35155 NODE_NAME_CASE(MOVSLDUP)
35156 NODE_NAME_CASE(MOVSD)
35157 NODE_NAME_CASE(MOVSS)
35158 NODE_NAME_CASE(MOVSH)
35159 NODE_NAME_CASE(UNPCKL)
35160 NODE_NAME_CASE(UNPCKH)
35161 NODE_NAME_CASE(VBROADCAST)
35162 NODE_NAME_CASE(VBROADCAST_LOAD)
35163 NODE_NAME_CASE(VBROADCASTM)
35164 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35165 NODE_NAME_CASE(VPERMILPV)
35166 NODE_NAME_CASE(VPERMILPI)
35167 NODE_NAME_CASE(VPERM2X128)
35168 NODE_NAME_CASE(VPERMV)
35169 NODE_NAME_CASE(VPERMV3)
35170 NODE_NAME_CASE(VPERMI)
35171 NODE_NAME_CASE(VPTERNLOG)
35172 NODE_NAME_CASE(FP_TO_SINT_SAT)
35173 NODE_NAME_CASE(FP_TO_UINT_SAT)
35174 NODE_NAME_CASE(VFIXUPIMM)
35175 NODE_NAME_CASE(VFIXUPIMM_SAE)
35176 NODE_NAME_CASE(VFIXUPIMMS)
35177 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35178 NODE_NAME_CASE(VRANGE)
35179 NODE_NAME_CASE(VRANGE_SAE)
35180 NODE_NAME_CASE(VRANGES)
35181 NODE_NAME_CASE(VRANGES_SAE)
35182 NODE_NAME_CASE(PMULUDQ)
35183 NODE_NAME_CASE(PMULDQ)
35184 NODE_NAME_CASE(PSADBW)
35185 NODE_NAME_CASE(DBPSADBW)
35186 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35187 NODE_NAME_CASE(VAARG_64)
35188 NODE_NAME_CASE(VAARG_X32)
35189 NODE_NAME_CASE(DYN_ALLOCA)
35190 NODE_NAME_CASE(MFENCE)
35191 NODE_NAME_CASE(SEG_ALLOCA)
35192 NODE_NAME_CASE(PROBED_ALLOCA)
35195 NODE_NAME_CASE(RDPKRU)
35196 NODE_NAME_CASE(WRPKRU)
35197 NODE_NAME_CASE(VPMADDUBSW)
35198 NODE_NAME_CASE(VPMADDWD)
35199 NODE_NAME_CASE(VPSHA)
35200 NODE_NAME_CASE(VPSHL)
35201 NODE_NAME_CASE(VPCOM)
35202 NODE_NAME_CASE(VPCOMU)
35203 NODE_NAME_CASE(VPERMIL2)
35204   NODE_NAME_CASE(FMSUB)
35205   NODE_NAME_CASE(STRICT_FMSUB)
35206   NODE_NAME_CASE(FNMADD)
35207   NODE_NAME_CASE(STRICT_FNMADD)
35208   NODE_NAME_CASE(FNMSUB)
35209   NODE_NAME_CASE(STRICT_FNMSUB)
35210 NODE_NAME_CASE(FMADDSUB)
35211 NODE_NAME_CASE(FMSUBADD)
35212 NODE_NAME_CASE(FMADD_RND)
35213 NODE_NAME_CASE(FNMADD_RND)
35214 NODE_NAME_CASE(FMSUB_RND)
35215 NODE_NAME_CASE(FNMSUB_RND)
35216 NODE_NAME_CASE(FMADDSUB_RND)
35217 NODE_NAME_CASE(FMSUBADD_RND)
35218 NODE_NAME_CASE(VFMADDC)
35219 NODE_NAME_CASE(VFMADDC_RND)
35220 NODE_NAME_CASE(VFCMADDC)
35221 NODE_NAME_CASE(VFCMADDC_RND)
35222 NODE_NAME_CASE(VFMULC)
35223 NODE_NAME_CASE(VFMULC_RND)
35224 NODE_NAME_CASE(VFCMULC)
35225 NODE_NAME_CASE(VFCMULC_RND)
35226 NODE_NAME_CASE(VFMULCSH)
35227 NODE_NAME_CASE(VFMULCSH_RND)
35228 NODE_NAME_CASE(VFCMULCSH)
35229 NODE_NAME_CASE(VFCMULCSH_RND)
35230 NODE_NAME_CASE(VFMADDCSH)
35231 NODE_NAME_CASE(VFMADDCSH_RND)
35232 NODE_NAME_CASE(VFCMADDCSH)
35233 NODE_NAME_CASE(VFCMADDCSH_RND)
35234 NODE_NAME_CASE(VPMADD52H)
35235 NODE_NAME_CASE(VPMADD52L)
35236 NODE_NAME_CASE(VRNDSCALE)
35237 NODE_NAME_CASE(STRICT_VRNDSCALE)
35238 NODE_NAME_CASE(VRNDSCALE_SAE)
35239 NODE_NAME_CASE(VRNDSCALES)
35240 NODE_NAME_CASE(VRNDSCALES_SAE)
35241 NODE_NAME_CASE(VREDUCE)
35242 NODE_NAME_CASE(VREDUCE_SAE)
35243 NODE_NAME_CASE(VREDUCES)
35244 NODE_NAME_CASE(VREDUCES_SAE)
35245 NODE_NAME_CASE(VGETMANT)
35246 NODE_NAME_CASE(VGETMANT_SAE)
35247 NODE_NAME_CASE(VGETMANTS)
35248 NODE_NAME_CASE(VGETMANTS_SAE)
35249 NODE_NAME_CASE(PCMPESTR)
35250 NODE_NAME_CASE(PCMPISTR)
35252 NODE_NAME_CASE(COMPRESS)
35253   NODE_NAME_CASE(EXPAND)
35254   NODE_NAME_CASE(SELECTS)
35255 NODE_NAME_CASE(ADDSUB)
35256 NODE_NAME_CASE(RCP14)
35257 NODE_NAME_CASE(RCP14S)
35258 NODE_NAME_CASE(RSQRT14)
35259 NODE_NAME_CASE(RSQRT14S)
35260 NODE_NAME_CASE(FADD_RND)
35261 NODE_NAME_CASE(FADDS)
35262 NODE_NAME_CASE(FADDS_RND)
35263 NODE_NAME_CASE(FSUB_RND)
35264 NODE_NAME_CASE(FSUBS)
35265 NODE_NAME_CASE(FSUBS_RND)
35266 NODE_NAME_CASE(FMUL_RND)
35267 NODE_NAME_CASE(FMULS)
35268 NODE_NAME_CASE(FMULS_RND)
35269 NODE_NAME_CASE(FDIV_RND)
35270 NODE_NAME_CASE(FDIVS)
35271 NODE_NAME_CASE(FDIVS_RND)
35272 NODE_NAME_CASE(FSQRT_RND)
35273 NODE_NAME_CASE(FSQRTS)
35274 NODE_NAME_CASE(FSQRTS_RND)
35275 NODE_NAME_CASE(FGETEXP)
35276 NODE_NAME_CASE(FGETEXP_SAE)
35277 NODE_NAME_CASE(FGETEXPS)
35278 NODE_NAME_CASE(FGETEXPS_SAE)
35279 NODE_NAME_CASE(SCALEF)
35280 NODE_NAME_CASE(SCALEF_RND)
35281 NODE_NAME_CASE(SCALEFS)
35282 NODE_NAME_CASE(SCALEFS_RND)
35283 NODE_NAME_CASE(MULHRS)
35284 NODE_NAME_CASE(SINT_TO_FP_RND)
35285 NODE_NAME_CASE(UINT_TO_FP_RND)
35286 NODE_NAME_CASE(CVTTP2SI)
35287 NODE_NAME_CASE(CVTTP2UI)
35288 NODE_NAME_CASE(STRICT_CVTTP2SI)
35289 NODE_NAME_CASE(STRICT_CVTTP2UI)
35290 NODE_NAME_CASE(MCVTTP2SI)
35291 NODE_NAME_CASE(MCVTTP2UI)
35292 NODE_NAME_CASE(CVTTP2SI_SAE)
35293 NODE_NAME_CASE(CVTTP2UI_SAE)
35294 NODE_NAME_CASE(CVTTS2SI)
35295 NODE_NAME_CASE(CVTTS2UI)
35296 NODE_NAME_CASE(CVTTS2SI_SAE)
35297 NODE_NAME_CASE(CVTTS2UI_SAE)
35298 NODE_NAME_CASE(CVTSI2P)
35299 NODE_NAME_CASE(CVTUI2P)
35300 NODE_NAME_CASE(STRICT_CVTSI2P)
35301 NODE_NAME_CASE(STRICT_CVTUI2P)
35302 NODE_NAME_CASE(MCVTSI2P)
35303 NODE_NAME_CASE(MCVTUI2P)
35304 NODE_NAME_CASE(VFPCLASS)
35305 NODE_NAME_CASE(VFPCLASSS)
35306 NODE_NAME_CASE(MULTISHIFT)
35307 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35308 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35309 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35310 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35311 NODE_NAME_CASE(CVTPS2PH)
35312 NODE_NAME_CASE(STRICT_CVTPS2PH)
35313 NODE_NAME_CASE(CVTPS2PH_SAE)
35314 NODE_NAME_CASE(MCVTPS2PH)
35315 NODE_NAME_CASE(MCVTPS2PH_SAE)
35316 NODE_NAME_CASE(CVTPH2PS)
35317 NODE_NAME_CASE(STRICT_CVTPH2PS)
35318 NODE_NAME_CASE(CVTPH2PS_SAE)
35319 NODE_NAME_CASE(CVTP2SI)
35320 NODE_NAME_CASE(CVTP2UI)
35321 NODE_NAME_CASE(MCVTP2SI)
35322 NODE_NAME_CASE(MCVTP2UI)
35323 NODE_NAME_CASE(CVTP2SI_RND)
35324 NODE_NAME_CASE(CVTP2UI_RND)
35325 NODE_NAME_CASE(CVTS2SI)
35326 NODE_NAME_CASE(CVTS2UI)
35327 NODE_NAME_CASE(CVTS2SI_RND)
35328 NODE_NAME_CASE(CVTS2UI_RND)
35329 NODE_NAME_CASE(CVTNEPS2BF16)
35330 NODE_NAME_CASE(MCVTNEPS2BF16)
35331 NODE_NAME_CASE(DPBF16PS)
35332 NODE_NAME_CASE(DPFP16PS)
35333 NODE_NAME_CASE(MPSADBW)
35334 NODE_NAME_CASE(LWPINS)
35335 NODE_NAME_CASE(MGATHER)
35336 NODE_NAME_CASE(MSCATTER)
35337 NODE_NAME_CASE(VPDPBUSD)
35338 NODE_NAME_CASE(VPDPBUSDS)
35339 NODE_NAME_CASE(VPDPWSSD)
35340 NODE_NAME_CASE(VPDPWSSDS)
35341 NODE_NAME_CASE(VPSHUFBITQMB)
35342 NODE_NAME_CASE(GF2P8MULB)
35343 NODE_NAME_CASE(GF2P8AFFINEQB)
35344 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35345 NODE_NAME_CASE(NT_CALL)
35346 NODE_NAME_CASE(NT_BRIND)
35347 NODE_NAME_CASE(UMWAIT)
35348 NODE_NAME_CASE(TPAUSE)
35349 NODE_NAME_CASE(ENQCMD)
35350 NODE_NAME_CASE(ENQCMDS)
35351 NODE_NAME_CASE(VP2INTERSECT)
35352 NODE_NAME_CASE(VPDPBSUD)
35353 NODE_NAME_CASE(VPDPBSUDS)
35354 NODE_NAME_CASE(VPDPBUUD)
35355 NODE_NAME_CASE(VPDPBUUDS)
35356 NODE_NAME_CASE(VPDPBSSD)
35357 NODE_NAME_CASE(VPDPBSSDS)
35358 NODE_NAME_CASE(VPDPWSUD)
35359 NODE_NAME_CASE(VPDPWSUDS)
35360 NODE_NAME_CASE(VPDPWUSD)
35361 NODE_NAME_CASE(VPDPWUSDS)
35362 NODE_NAME_CASE(VPDPWUUD)
35363 NODE_NAME_CASE(VPDPWUUDS)
35364 NODE_NAME_CASE(VMINMAX)
35365 NODE_NAME_CASE(VMINMAX_SAE)
35366 NODE_NAME_CASE(VMINMAXS)
35367 NODE_NAME_CASE(VMINMAXS_SAE)
35368 NODE_NAME_CASE(CVTP2IBS)
35369 NODE_NAME_CASE(CVTP2IUBS)
35370 NODE_NAME_CASE(CVTP2IBS_RND)
35371 NODE_NAME_CASE(CVTP2IUBS_RND)
35372 NODE_NAME_CASE(CVTTP2IBS)
35373 NODE_NAME_CASE(CVTTP2IUBS)
35374 NODE_NAME_CASE(CVTTP2IBS_SAE)
35375 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35376 NODE_NAME_CASE(VCVT2PH2BF8)
35377 NODE_NAME_CASE(VCVT2PH2BF8S)
35378 NODE_NAME_CASE(VCVT2PH2HF8)
35379 NODE_NAME_CASE(VCVT2PH2HF8S)
35380 NODE_NAME_CASE(VCVTBIASPH2BF8)
35381 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35382 NODE_NAME_CASE(VCVTBIASPH2HF8)
35383 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35384 NODE_NAME_CASE(VCVTPH2BF8)
35385 NODE_NAME_CASE(VCVTPH2BF8S)
35386 NODE_NAME_CASE(VCVTPH2HF8)
35387 NODE_NAME_CASE(VCVTPH2HF8S)
35388 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35389 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35390 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35391 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35392 NODE_NAME_CASE(VMCVTPH2BF8)
35393 NODE_NAME_CASE(VMCVTPH2BF8S)
35394 NODE_NAME_CASE(VMCVTPH2HF8)
35395 NODE_NAME_CASE(VMCVTPH2HF8S)
35396 NODE_NAME_CASE(VCVTHF82PH)
35397 NODE_NAME_CASE(AESENC128KL)
35398 NODE_NAME_CASE(AESDEC128KL)
35399 NODE_NAME_CASE(AESENC256KL)
35400 NODE_NAME_CASE(AESDEC256KL)
35401 NODE_NAME_CASE(AESENCWIDE128KL)
35402 NODE_NAME_CASE(AESDECWIDE128KL)
35403 NODE_NAME_CASE(AESENCWIDE256KL)
35404 NODE_NAME_CASE(AESDECWIDE256KL)
35405 NODE_NAME_CASE(CMPCCXADD)
35406 NODE_NAME_CASE(TESTUI)
35407 NODE_NAME_CASE(FP80_ADD)
35408 NODE_NAME_CASE(STRICT_FP80_ADD)
35409 NODE_NAME_CASE(CCMP)
35410 NODE_NAME_CASE(CTEST)
35411 NODE_NAME_CASE(CLOAD)
35412 NODE_NAME_CASE(CSTORE)
35413 NODE_NAME_CASE(CVTTS2SIS)
35414 NODE_NAME_CASE(CVTTS2UIS)
35415 NODE_NAME_CASE(CVTTS2SIS_SAE)
35416 NODE_NAME_CASE(CVTTS2UIS_SAE)
35417 NODE_NAME_CASE(CVTTP2SIS)
35418 NODE_NAME_CASE(MCVTTP2SIS)
35419 NODE_NAME_CASE(CVTTP2UIS_SAE)
35420 NODE_NAME_CASE(CVTTP2SIS_SAE)
35421 NODE_NAME_CASE(CVTTP2UIS)
35422 NODE_NAME_CASE(MCVTTP2UIS)
35423 NODE_NAME_CASE(POP_FROM_X87_REG)
35424 }
35425 return nullptr;
35426#undef NODE_NAME_CASE
35427}
35428
35429/// Return true if the addressing mode represented by AM is legal for this
35430/// target, for a load/store of the specified type.
35431bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35432                                              const AddrMode &AM, Type *Ty,
35433 unsigned AS,
35434 Instruction *I) const {
35435 // X86 supports extremely general addressing modes.
35436  CodeModel::Model M = getTargetMachine().getCodeModel();
35437
35438 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35439 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35440 return false;
35441
35442 if (AM.BaseGV) {
35443 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35444
35445 // If a reference to this global requires an extra load, we can't fold it.
35446 if (isGlobalStubReference(GVFlags))
35447 return false;
35448
35449 // If BaseGV requires a register for the PIC base, we cannot also have a
35450 // BaseReg specified.
35451 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35452 return false;
35453
35454 // If lower 4G is not available, then we must use rip-relative addressing.
35455 if ((M != CodeModel::Small || isPositionIndependent()) &&
35456 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35457 return false;
35458 }
35459
35460 switch (AM.Scale) {
35461 case 0:
35462 case 1:
35463 case 2:
35464 case 4:
35465 case 8:
35466 // These scales always work.
35467 break;
35468 case 3:
35469 case 5:
35470 case 9:
35471 // These scales are formed with basereg+scalereg. Only accept if there is
35472 // no basereg yet.
35473 if (AM.HasBaseReg)
35474 return false;
35475 break;
35476 default: // Other stuff never works.
35477 return false;
35478 }
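  // e.g. Scale == 3 is matched as lea (%reg,%reg,2): the same register has to
  // occupy the base slot, which is why these scales are rejected when a base
  // register is already present.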
35479
35480 return true;
35481}
35482
35483bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35484 switch (Opcode) {
35485 // These are non-commutative binops.
35486 // TODO: Add more X86ISD opcodes once we have test coverage.
35487 case X86ISD::ANDNP:
35488 case X86ISD::PCMPGT:
35489 case X86ISD::FMAX:
35490 case X86ISD::FMIN:
35491 case X86ISD::FANDN:
35492 case X86ISD::VPSHA:
35493 case X86ISD::VPSHL:
35494 case X86ISD::VSHLV:
35495 case X86ISD::VSRLV:
35496 case X86ISD::VSRAV:
35497 return true;
35498 }
35499
35500 return TargetLoweringBase::isBinOp(Opcode);
35501}
35502
35503bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35504 switch (Opcode) {
35505 // TODO: Add more X86ISD opcodes once we have test coverage.
35506 case X86ISD::PCMPEQ:
35507 case X86ISD::PMULDQ:
35508 case X86ISD::PMULUDQ:
35509 case X86ISD::FMAXC:
35510 case X86ISD::FMINC:
35511 case X86ISD::FAND:
35512 case X86ISD::FOR:
35513 case X86ISD::FXOR:
35514 return true;
35515 }
35516
35517  return TargetLoweringBase::isCommutativeBinOp(Opcode);
35518}
35519
35520bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35521  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35522 return false;
35523 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35524 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35525 return NumBits1 > NumBits2;
35526}
35527
35528bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35529  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35530 return false;
35531
35532 if (!isTypeLegal(EVT::getEVT(Ty1)))
35533 return false;
35534
35535 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35536
35537 // Assuming the caller doesn't have a zeroext or signext return parameter,
35538 // truncation all the way down to i1 is valid.
35539 return true;
35540}
35541
35543 return isInt<32>(Imm);
35544}
35545
35547 // Can also use sub to handle negated immediates.
35548 return isInt<32>(Imm);
35549}
35550
35552 return isInt<32>(Imm);
35553}
35554
35555bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35556  if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35557 return false;
35558 unsigned NumBits1 = VT1.getSizeInBits();
35559 unsigned NumBits2 = VT2.getSizeInBits();
35560 return NumBits1 > NumBits2;
35561}
35562
35563bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35564  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35565 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35566}
35567
35568bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35569  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35570 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35571}
35572
35573bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35574  EVT VT1 = Val.getValueType();
35575 if (isZExtFree(VT1, VT2))
35576 return true;
35577
35578 if (Val.getOpcode() != ISD::LOAD)
35579 return false;
35580
35581 if (!VT1.isSimple() || !VT1.isInteger() ||
35582 !VT2.isSimple() || !VT2.isInteger())
35583 return false;
35584
35585 switch (VT1.getSimpleVT().SimpleTy) {
35586 default: break;
35587 case MVT::i8:
35588 case MVT::i16:
35589 case MVT::i32:
35590 // X86 has 8, 16, and 32-bit zero-extending loads.
35591 return true;
35592 }
35593
35594 return false;
35595}
35596
35597bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35598  if (!Subtarget.is64Bit())
35599 return false;
35600 return TargetLowering::shouldConvertPhiType(From, To);
35601}
35602
35603bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35604  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35605 return false;
35606
35607 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35608
35609 // There is no extending load for vXi1.
35610 if (SrcVT.getScalarType() == MVT::i1)
35611 return false;
35612
35613 return true;
35614}
35615
35616bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35617                                                   EVT VT) const {
35618 if (Subtarget.useSoftFloat())
35619 return false;
35620
35621 if (!Subtarget.hasAnyFMA())
35622 return false;
35623
35624 VT = VT.getScalarType();
35625
35626 if (!VT.isSimple())
35627 return false;
35628
35629 switch (VT.getSimpleVT().SimpleTy) {
35630 case MVT::f16:
35631 return Subtarget.hasFP16();
35632 case MVT::f32:
35633 case MVT::f64:
35634 return true;
35635 default:
35636 break;
35637 }
35638
35639 return false;
35640}
35641
35642bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35643                                              EVT DestVT) const {
35644 // i16 instructions are longer (0x66 prefix) and potentially slower.
35645 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35646}
35647
35648bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35649    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35650 SDValue Y) const {
35651 if (SelectOpcode == ISD::SELECT) {
35652 if (VT.isVector())
35653 return false;
35654 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35655 return false;
35656 using namespace llvm::SDPatternMatch;
35657 // BLSI
35658 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35659                                   sd_match(X, m_Neg(m_Specific(Y)))))
35660       return true;
35661 // BLSR
35662 if (BinOpcode == ISD::AND &&
35663         (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35664          sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35665       return true;
35666 // BLSMSK
35667 if (BinOpcode == ISD::XOR &&
35668         (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35669          sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35670       return true;
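    // The patterns above correspond to the BMI1 idioms:
    //   BLSI:   x & -x        isolate lowest set bit
    //   BLSR:   x & (x - 1)   clear lowest set bit
    //   BLSMSK: x ^ (x - 1)   mask up to the lowest set bit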
35671
35672 return false;
35673 }
35674 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35675 // benefit. The transform may also be profitable for scalar code.
35676 if (!Subtarget.hasAVX512())
35677 return false;
35678 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35679 return false;
35680 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35681 return false;
35682
35683 return true;
35684}
35685
35686/// Targets can use this to indicate that they only support *some*
35687/// VECTOR_SHUFFLE operations, those with specific masks.
35688/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35689/// are assumed to be legal.
35690bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35691  if (!VT.isSimple())
35692 return false;
35693
35694 // Not for i1 vectors
35695 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35696 return false;
35697
35698 // Very little shuffling can be done for 64-bit vectors right now.
35699 if (VT.getSimpleVT().getSizeInBits() == 64)
35700 return false;
35701
35702 // We only care that the types being shuffled are legal. The lowering can
35703 // handle any possible shuffle mask that results.
35704 return isTypeLegal(VT.getSimpleVT());
35705}
35706
35707bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35708                                               EVT VT) const {
35709 // Don't convert an 'and' into a shuffle that we don't directly support.
35710 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35711 if (!Subtarget.hasAVX2())
35712 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35713 return false;
35714
35715 // Just delegate to the generic legality, clear masks aren't special.
35716 return isShuffleMaskLegal(Mask, VT);
35717}
35718
35719bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35720  // If the subtarget is using thunks, we need to not generate jump tables.
35721 if (Subtarget.useIndirectThunkBranches())
35722 return false;
35723
35724 // Otherwise, fallback on the generic logic.
35725  return TargetLowering::areJTsAllowed(Fn);
35726}
35727
35728MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35729                                                       EVT ConditionVT) const {
35730 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35731 // zero-extensions.
35732 if (ConditionVT.getSizeInBits() < 32)
35733 return MVT::i32;
35734  return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35735                                                             ConditionVT);
35736}
35737
35738//===----------------------------------------------------------------------===//
35739// X86 Scheduler Hooks
35740//===----------------------------------------------------------------------===//
35741
35742/// Utility function to emit xbegin specifying the start of an RTM region.
35743static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35744                                     const TargetInstrInfo *TII) {
35745 const MIMetadata MIMD(MI);
35746
35747 const BasicBlock *BB = MBB->getBasicBlock();
35748 MachineFunction::iterator I = ++MBB->getIterator();
35749
35750 // For the v = xbegin(), we generate
35751 //
35752 // thisMBB:
35753 // xbegin sinkMBB
35754 //
35755 // mainMBB:
35756 // s0 = -1
35757 //
35758 // fallBB:
35759 // eax = # XABORT_DEF
35760 // s1 = eax
35761 //
35762 // sinkMBB:
35763 // v = phi(s0/mainBB, s1/fallBB)
35764
35765 MachineBasicBlock *thisMBB = MBB;
35766 MachineFunction *MF = MBB->getParent();
35767 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35768 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35769 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35770 MF->insert(I, mainMBB);
35771 MF->insert(I, fallMBB);
35772 MF->insert(I, sinkMBB);
35773
35774 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35775 mainMBB->addLiveIn(X86::EFLAGS);
35776 fallMBB->addLiveIn(X86::EFLAGS);
35777 sinkMBB->addLiveIn(X86::EFLAGS);
35778 }
35779
35780 // Transfer the remainder of BB and its successor edges to sinkMBB.
35781 sinkMBB->splice(sinkMBB->begin(), MBB,
35782 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35783   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35784
35785   MachineRegisterInfo &MRI = MF->getRegInfo();
35786   Register DstReg = MI.getOperand(0).getReg();
35787 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35788 Register mainDstReg = MRI.createVirtualRegister(RC);
35789 Register fallDstReg = MRI.createVirtualRegister(RC);
35790
35791 // thisMBB:
35792 // xbegin fallMBB
35793 // # fallthrough to mainMBB
35794  // # on abort, branch to fallMBB
35795 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35796 thisMBB->addSuccessor(mainMBB);
35797 thisMBB->addSuccessor(fallMBB);
35798
35799 // mainMBB:
35800 // mainDstReg := -1
35801 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35802 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35803 mainMBB->addSuccessor(sinkMBB);
35804
35805 // fallMBB:
35806 // ; pseudo instruction to model hardware's definition from XABORT
35807 // EAX := XABORT_DEF
35808 // fallDstReg := EAX
35809 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35810 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35811 .addReg(X86::EAX);
35812 fallMBB->addSuccessor(sinkMBB);
35813
35814 // sinkMBB:
35815 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35816 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35817 .addReg(mainDstReg).addMBB(mainMBB)
35818 .addReg(fallDstReg).addMBB(fallMBB);
35819
35820 MI.eraseFromParent();
35821 return sinkMBB;
35822}
35823
35825X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35826 MachineBasicBlock *MBB) const {
35827 // Emit va_arg instruction on X86-64.
35828
35829 // Operands to this pseudo-instruction:
35830 // 0 ) Output : destination address (reg)
35831 // 1-5) Input : va_list address (addr, i64mem)
35832 // 6 ) ArgSize : Size (in bytes) of vararg type
35833 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35834 // 8 ) Align : Alignment of type
35835 // 9 ) EFLAGS (implicit-def)
35836
35837 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35838 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35839
35840 Register DestReg = MI.getOperand(0).getReg();
35841 MachineOperand &Base = MI.getOperand(1);
35842 MachineOperand &Scale = MI.getOperand(2);
35843 MachineOperand &Index = MI.getOperand(3);
35844 MachineOperand &Disp = MI.getOperand(4);
35845 MachineOperand &Segment = MI.getOperand(5);
35846 unsigned ArgSize = MI.getOperand(6).getImm();
35847 unsigned ArgMode = MI.getOperand(7).getImm();
35848 Align Alignment = Align(MI.getOperand(8).getImm());
35849
35850 MachineFunction *MF = MBB->getParent();
35851
35852 // Memory Reference
35853 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35854
35855 MachineMemOperand *OldMMO = MI.memoperands().front();
35856
35857 // Clone the MMO into two separate MMOs for loading and storing
35858 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35859 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35860 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35861 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35862
35863 // Machine Information
35864 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35865 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35866 const TargetRegisterClass *AddrRegClass =
35867      getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35868  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35869 const MIMetadata MIMD(MI);
35870
35871 // struct va_list {
35872 // i32 gp_offset
35873 // i32 fp_offset
35874 // i64 overflow_area (address)
35875 // i64 reg_save_area (address)
35876 // }
35877 // sizeof(va_list) = 24
35878 // alignment(va_list) = 8
35879
35880 unsigned TotalNumIntRegs = 6;
35881 unsigned TotalNumXMMRegs = 8;
35882 bool UseGPOffset = (ArgMode == 1);
35883 bool UseFPOffset = (ArgMode == 2);
35884 unsigned MaxOffset = TotalNumIntRegs * 8 +
35885 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
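  // With the SysV x86-64 register save area this is 6*8 = 48 bytes of GP
  // registers plus, when fp_offset is used, 8*16 = 128 bytes of XMM
  // registers, i.e. MaxOffset is 48 or 176.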
35886
35887 /* Align ArgSize to a multiple of 8 */
35888 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35889 bool NeedsAlign = (Alignment > 8);
35890
35891 MachineBasicBlock *thisMBB = MBB;
35892 MachineBasicBlock *overflowMBB;
35893 MachineBasicBlock *offsetMBB;
35894 MachineBasicBlock *endMBB;
35895
35896 Register OffsetDestReg; // Argument address computed by offsetMBB
35897 Register OverflowDestReg; // Argument address computed by overflowMBB
35898 Register OffsetReg;
35899
35900 if (!UseGPOffset && !UseFPOffset) {
35901 // If we only pull from the overflow region, we don't create a branch.
35902 // We don't need to alter control flow.
35903 OffsetDestReg = Register(); // unused
35904 OverflowDestReg = DestReg;
35905
35906 offsetMBB = nullptr;
35907 overflowMBB = thisMBB;
35908 endMBB = thisMBB;
35909 } else {
35910 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35911 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35912 // If not, pull from overflow_area. (branch to overflowMBB)
35913 //
35914 // thisMBB
35915 // | .
35916 // | .
35917 // offsetMBB overflowMBB
35918 // | .
35919 // | .
35920 // endMBB
35921
35922 // Registers for the PHI in endMBB
35923 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35924 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35925
35926 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35927 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35928 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35929 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35930
35931    MachineFunction::iterator MBBIter = ++MBB->getIterator();
35932
35933 // Insert the new basic blocks
35934 MF->insert(MBBIter, offsetMBB);
35935 MF->insert(MBBIter, overflowMBB);
35936 MF->insert(MBBIter, endMBB);
35937
35938 // Transfer the remainder of MBB and its successor edges to endMBB.
35939 endMBB->splice(endMBB->begin(), thisMBB,
35940 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35941 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35942
35943 // Make offsetMBB and overflowMBB successors of thisMBB
35944 thisMBB->addSuccessor(offsetMBB);
35945 thisMBB->addSuccessor(overflowMBB);
35946
35947 // endMBB is a successor of both offsetMBB and overflowMBB
35948 offsetMBB->addSuccessor(endMBB);
35949 overflowMBB->addSuccessor(endMBB);
35950
35951 // Load the offset value into a register
35952 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35953 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35954 .add(Base)
35955 .add(Scale)
35956 .add(Index)
35957 .addDisp(Disp, UseFPOffset ? 4 : 0)
35958 .add(Segment)
35959 .setMemRefs(LoadOnlyMMO);
35960
35961 // Check if there is enough room left to pull this argument.
35962 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35963 .addReg(OffsetReg)
35964 .addImm(MaxOffset + 8 - ArgSizeA8);
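  // Equivalently: the overflow path is taken once OffsetReg + ArgSizeA8 would
  // exceed MaxOffset; since all of these values advance in multiples of 8,
  // that is the same as OffsetReg >= MaxOffset + 8 - ArgSizeA8.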
35965
35966 // Branch to "overflowMBB" if offset >= max
35967 // Fall through to "offsetMBB" otherwise
35968 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35969 .addMBB(overflowMBB).addImm(X86::COND_AE);
35970 }
35971
35972 // In offsetMBB, emit code to use the reg_save_area.
35973 if (offsetMBB) {
35974 assert(OffsetReg != 0);
35975
35976 // Read the reg_save_area address.
35977 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35978 BuildMI(
35979 offsetMBB, MIMD,
35980 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35981 RegSaveReg)
35982 .add(Base)
35983 .add(Scale)
35984 .add(Index)
35985 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35986 .add(Segment)
35987 .setMemRefs(LoadOnlyMMO);
35988
35989 if (Subtarget.isTarget64BitLP64()) {
35990 // Zero-extend the offset
35991 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35992 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35993 .addImm(0)
35994 .addReg(OffsetReg)
35995 .addImm(X86::sub_32bit);
35996
35997 // Add the offset to the reg_save_area to get the final address.
35998 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35999 .addReg(OffsetReg64)
36000 .addReg(RegSaveReg);
36001 } else {
36002 // Add the offset to the reg_save_area to get the final address.
36003 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
36004 .addReg(OffsetReg)
36005 .addReg(RegSaveReg);
36006 }
36007
36008 // Compute the offset for the next argument
36009 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36010 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
36011 .addReg(OffsetReg)
36012 .addImm(UseFPOffset ? 16 : 8);
36013
36014 // Store it back into the va_list.
36015 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36016 .add(Base)
36017 .add(Scale)
36018 .add(Index)
36019 .addDisp(Disp, UseFPOffset ? 4 : 0)
36020 .add(Segment)
36021 .addReg(NextOffsetReg)
36022 .setMemRefs(StoreOnlyMMO);
36023
36024 // Jump to endMBB
36025 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36026 .addMBB(endMBB);
36027 }
36028
36029 //
36030 // Emit code to use overflow area
36031 //
36032
36033 // Load the overflow_area address into a register.
36034 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36035 BuildMI(overflowMBB, MIMD,
36036 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36037 OverflowAddrReg)
36038 .add(Base)
36039 .add(Scale)
36040 .add(Index)
36041 .addDisp(Disp, 8)
36042 .add(Segment)
36043 .setMemRefs(LoadOnlyMMO);
36044
36045 // If we need to align it, do so. Otherwise, just copy the address
36046 // to OverflowDestReg.
36047 if (NeedsAlign) {
36048 // Align the overflow address
36049 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36050
36051 // aligned_addr = (addr + (align-1)) & ~(align-1)
36052 BuildMI(
36053 overflowMBB, MIMD,
36054 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36055 TmpReg)
36056 .addReg(OverflowAddrReg)
36057 .addImm(Alignment.value() - 1);
36058
36059 BuildMI(
36060 overflowMBB, MIMD,
36061 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36062 OverflowDestReg)
36063 .addReg(TmpReg)
36064 .addImm(~(uint64_t)(Alignment.value() - 1));
36065 } else {
36066 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36067 .addReg(OverflowAddrReg);
36068 }
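  // For illustration: with Alignment = 16, an overflow address of 0x1001
  // becomes (0x1001 + 15) & ~15 = 0x1010, while an already aligned 0x1010 is
  // left unchanged.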
36069
36070 // Compute the next overflow address after this argument.
36071 // (the overflow address should be kept 8-byte aligned)
36072 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36073 BuildMI(
36074 overflowMBB, MIMD,
36075 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36076 NextAddrReg)
36077 .addReg(OverflowDestReg)
36078 .addImm(ArgSizeA8);
36079
36080 // Store the new overflow address.
36081 BuildMI(overflowMBB, MIMD,
36082 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36083 .add(Base)
36084 .add(Scale)
36085 .add(Index)
36086 .addDisp(Disp, 8)
36087 .add(Segment)
36088 .addReg(NextAddrReg)
36089 .setMemRefs(StoreOnlyMMO);
36090
36091 // If we branched, emit the PHI to the front of endMBB.
36092 if (offsetMBB) {
36093 BuildMI(*endMBB, endMBB->begin(), MIMD,
36094 TII->get(X86::PHI), DestReg)
36095 .addReg(OffsetDestReg).addMBB(offsetMBB)
36096 .addReg(OverflowDestReg).addMBB(overflowMBB);
36097 }
36098
36099 // Erase the pseudo instruction
36100 MI.eraseFromParent();
36101
36102 return endMBB;
36103}
36104
36105// The EFLAGS operand of SelectItr might be missing a kill marker
36106// because there were multiple uses of EFLAGS, and ISel didn't know
36107// which to mark. Figure out whether SelectItr should have had a
36108// kill marker, and set it if it should. Returns the correct kill
36109// marker value.
36110static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36111                                     MachineBasicBlock* BB,
36112                                     const TargetRegisterInfo* TRI) {
36113 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36114 return false;
36115
36116 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36117 // out. SelectMI should have a kill flag on EFLAGS.
36118 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36119 return true;
36120}
36121
36122// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36123// together with other CMOV pseudo-opcodes into a single basic-block with
36124// conditional jump around it.
36125static bool isCMOVPseudo(MachineInstr &MI) {
36126  switch (MI.getOpcode()) {
36127 case X86::CMOV_FR16:
36128 case X86::CMOV_FR16X:
36129 case X86::CMOV_FR32:
36130 case X86::CMOV_FR32X:
36131 case X86::CMOV_FR64:
36132 case X86::CMOV_FR64X:
36133 case X86::CMOV_GR8:
36134 case X86::CMOV_GR16:
36135 case X86::CMOV_GR32:
36136 case X86::CMOV_RFP32:
36137 case X86::CMOV_RFP64:
36138 case X86::CMOV_RFP80:
36139 case X86::CMOV_VR64:
36140 case X86::CMOV_VR128:
36141 case X86::CMOV_VR128X:
36142 case X86::CMOV_VR256:
36143 case X86::CMOV_VR256X:
36144 case X86::CMOV_VR512:
36145 case X86::CMOV_VK1:
36146 case X86::CMOV_VK2:
36147 case X86::CMOV_VK4:
36148 case X86::CMOV_VK8:
36149 case X86::CMOV_VK16:
36150 case X86::CMOV_VK32:
36151 case X86::CMOV_VK64:
36152 return true;
36153
36154 default:
36155 return false;
36156 }
36157}
36158
36159// Helper function, which inserts PHI functions into SinkMBB:
36160// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36161// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36162// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36163// the last PHI function inserted.
36164static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36165    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36166    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36167 MachineBasicBlock *SinkMBB) {
36168 MachineFunction *MF = TrueMBB->getParent();
36169  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36170  const MIMetadata MIMD(*MIItBegin);
36171
36172 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36173  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36174
36175 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36176
36177 // As we are creating the PHIs, we have to be careful if there is more than
36178 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36179 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36180 // That also means that PHI construction must work forward from earlier to
36181 // later, and that the code must maintain a mapping from earlier PHI's
36182 // destination registers, and the registers that went into the PHI.
36183  DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36184  MachineInstrBuilder MIB;
36185
36186 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36187 Register DestReg = MIIt->getOperand(0).getReg();
36188 Register Op1Reg = MIIt->getOperand(1).getReg();
36189 Register Op2Reg = MIIt->getOperand(2).getReg();
36190
36191    // If the CMOV we are generating uses the opposite condition from
36192    // the jump we generated, then we have to swap the operands for the
36193    // PHI that is going to be generated.
36194 if (MIIt->getOperand(3).getImm() == OppCC)
36195 std::swap(Op1Reg, Op2Reg);
36196
36197 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36198 Op1Reg = It->second.first;
36199
36200 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36201 Op2Reg = It->second.second;
36202
36203 MIB =
36204 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36205 .addReg(Op1Reg)
36206 .addMBB(FalseMBB)
36207 .addReg(Op2Reg)
36208 .addMBB(TrueMBB);
36209
36210 // Add this PHI to the rewrite table.
36211 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36212 }
36213
36214 return MIB;
36215}
36216
36217// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36218MachineBasicBlock *
36219X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36220 MachineInstr &SecondCascadedCMOV,
36221 MachineBasicBlock *ThisMBB) const {
36222 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36223 const MIMetadata MIMD(FirstCMOV);
36224
36225 // We lower cascaded CMOVs such as
36226 //
36227 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36228 //
36229 // to two successive branches.
36230 //
36231 // Without this, we would add a PHI between the two jumps, which ends up
36232 // creating a few copies all around. For instance, for
36233 //
36234 // (sitofp (zext (fcmp une)))
36235 //
36236 // we would generate:
36237 //
36238 // ucomiss %xmm1, %xmm0
36239 // movss <1.0f>, %xmm0
36240 // movaps %xmm0, %xmm1
36241 // jne .LBB5_2
36242 // xorps %xmm1, %xmm1
36243 // .LBB5_2:
36244 // jp .LBB5_4
36245 // movaps %xmm1, %xmm0
36246 // .LBB5_4:
36247 // retq
36248 //
36249 // because this custom-inserter would have generated:
36250 //
36251 // A
36252 // | \
36253 // | B
36254 // | /
36255 // C
36256 // | \
36257 // | D
36258 // | /
36259 // E
36260 //
36261 // A: X = ...; Y = ...
36262 // B: empty
36263 // C: Z = PHI [X, A], [Y, B]
36264 // D: empty
36265 // E: PHI [X, C], [Z, D]
36266 //
36267 // If we lower both CMOVs in a single step, we can instead generate:
36268 //
36269 // A
36270 // | \
36271 // | C
36272 // | /|
36273 // |/ |
36274 // | |
36275 // | D
36276 // | /
36277 // E
36278 //
36279 // A: X = ...; Y = ...
36280 // D: empty
36281 // E: PHI [X, A], [X, C], [Y, D]
36282 //
36283 // Which, in our sitofp/fcmp example, gives us something like:
36284 //
36285 // ucomiss %xmm1, %xmm0
36286 // movss <1.0f>, %xmm0
36287 // jne .LBB5_4
36288 // jp .LBB5_4
36289 // xorps %xmm0, %xmm0
36290 // .LBB5_4:
36291 // retq
36292 //
36293
36294 // We lower cascaded CMOV into two successive branches to the same block.
36295 // EFLAGS is used by both, so mark it as live in the second.
36296 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36297 MachineFunction *F = ThisMBB->getParent();
36298 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36299 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36300 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36301
36302 MachineFunction::iterator It = ++ThisMBB->getIterator();
36303 F->insert(It, FirstInsertedMBB);
36304 F->insert(It, SecondInsertedMBB);
36305 F->insert(It, SinkMBB);
36306
36307 // For a cascaded CMOV, we lower it to two successive branches to
36308 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36309 // the FirstInsertedMBB.
36310 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36311
36312 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36313 // live into the sink and copy blocks.
36314 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36315 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36316 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36317 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36318 SinkMBB->addLiveIn(X86::EFLAGS);
36319 }
36320
36321 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36322 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36323 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36324 ThisMBB->end());
36325 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36326
36327 // Fallthrough block for ThisMBB.
36328 ThisMBB->addSuccessor(FirstInsertedMBB);
36329 // The true block target of the first branch is always SinkMBB.
36330 ThisMBB->addSuccessor(SinkMBB);
36331 // Fallthrough block for FirstInsertedMBB.
36332 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36333 // The true block for the branch of FirstInsertedMBB.
36334 FirstInsertedMBB->addSuccessor(SinkMBB);
36335 // This is fallthrough.
36336 SecondInsertedMBB->addSuccessor(SinkMBB);
36337
36338 // Create the conditional branch instructions.
36339 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36340 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36341
36342 X86::CondCode SecondCC =
36343 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36344 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36345 .addMBB(SinkMBB)
36346 .addImm(SecondCC);
36347
36348 // SinkMBB:
36349 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36350 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36351 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36352 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36353 MachineInstrBuilder MIB =
36354 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36355 .addReg(Op1Reg)
36356 .addMBB(SecondInsertedMBB)
36357 .addReg(Op2Reg)
36358 .addMBB(ThisMBB);
36359
36360  // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36361  // (the True operand of the SELECT_CC/CMOV nodes).
36362 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36363
36364 // Now remove the CMOVs.
36365 FirstCMOV.eraseFromParent();
36366 SecondCascadedCMOV.eraseFromParent();
36367
36368 return SinkMBB;
36369}
36370
36371MachineBasicBlock *
36372X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36373 MachineBasicBlock *ThisMBB) const {
36374 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36375 const MIMetadata MIMD(MI);
36376
36377 // To "insert" a SELECT_CC instruction, we actually have to insert the
36378 // diamond control-flow pattern. The incoming instruction knows the
36379 // destination vreg to set, the condition code register to branch on, the
36380 // true/false values to select between and a branch opcode to use.
36381
36382 // ThisMBB:
36383 // ...
36384 // TrueVal = ...
36385 // cmpTY ccX, r1, r2
36386 // bCC copy1MBB
36387 // fallthrough --> FalseMBB
36388
36389 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36390 // as described above, by inserting a BB, and then making a PHI at the join
36391 // point to select the true and false operands of the CMOV in the PHI.
36392 //
36393 // The code also handles two different cases of multiple CMOV opcodes
36394 // in a row.
36395 //
36396 // Case 1:
36397 // In this case, there are multiple CMOVs in a row, all which are based on
36398 // the same condition setting (or the exact opposite condition setting).
36399 // In this case we can lower all the CMOVs using a single inserted BB, and
36400 // then make a number of PHIs at the join point to model the CMOVs. The only
36401  // trickiness here is that in a case like:
36402 //
36403 // t2 = CMOV cond1 t1, f1
36404 // t3 = CMOV cond1 t2, f2
36405 //
36406 // when rewriting this into PHIs, we have to perform some renaming on the
36407 // temps since you cannot have a PHI operand refer to a PHI result earlier
36408 // in the same block. The "simple" but wrong lowering would be:
36409 //
36410 // t2 = PHI t1(BB1), f1(BB2)
36411 // t3 = PHI t2(BB1), f2(BB2)
36412 //
36413 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36414 // renaming is to note that on the path through BB1, t2 is really just a
36415 // copy of t1, and do that renaming, properly generating:
36416 //
36417 // t2 = PHI t1(BB1), f1(BB2)
36418 // t3 = PHI t1(BB1), f2(BB2)
36419 //
36420 // Case 2:
36421 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36422 // function - EmitLoweredCascadedSelect.
36423
36424 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36425  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36426  MachineInstr *LastCMOV = &MI;
36427  MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36428
36429 // Check for case 1, where there are multiple CMOVs with the same condition
36430 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36431 // number of jumps the most.
36432
36433 if (isCMOVPseudo(MI)) {
36434    // See if we have a string of CMOVs with the same condition. Skip over
36435 // intervening debug insts.
36436 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36437 (NextMIIt->getOperand(3).getImm() == CC ||
36438 NextMIIt->getOperand(3).getImm() == OppCC)) {
36439 LastCMOV = &*NextMIIt;
36440 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36441 }
36442 }
36443
36444  // Check for case 2, but only if we didn't already find case 1, as
36445  // indicated by LastCMOV == &MI.
36446 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36447 NextMIIt->getOpcode() == MI.getOpcode() &&
36448 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36449 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36450 NextMIIt->getOperand(1).isKill()) {
36451 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36452 }
36453
36454 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36455 MachineFunction *F = ThisMBB->getParent();
36456 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36457 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36458
36459 MachineFunction::iterator It = ++ThisMBB->getIterator();
36460 F->insert(It, FalseMBB);
36461 F->insert(It, SinkMBB);
36462
36463 // Set the call frame size on entry to the new basic blocks.
36464 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36465 FalseMBB->setCallFrameSize(CallFrameSize);
36466 SinkMBB->setCallFrameSize(CallFrameSize);
36467
36468 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36469 // live into the sink and copy blocks.
36470 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36471 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36472 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36473 FalseMBB->addLiveIn(X86::EFLAGS);
36474 SinkMBB->addLiveIn(X86::EFLAGS);
36475 }
36476
36477 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36478  auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36479                                   MachineBasicBlock::iterator(LastCMOV));
36480 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36481 if (MI.isDebugInstr())
36482 SinkMBB->push_back(MI.removeFromParent());
36483
36484 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36485 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36486 std::next(MachineBasicBlock::iterator(LastCMOV)),
36487 ThisMBB->end());
36488 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36489
36490 // Fallthrough block for ThisMBB.
36491 ThisMBB->addSuccessor(FalseMBB);
36492 // The true block target of the first (or only) branch is always a SinkMBB.
36493 ThisMBB->addSuccessor(SinkMBB);
36494 // Fallthrough block for FalseMBB.
36495 FalseMBB->addSuccessor(SinkMBB);
36496
36497 // Create the conditional branch instruction.
36498 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36499
36500 // SinkMBB:
36501 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36502 // ...
36503  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36504  MachineBasicBlock::iterator MIItEnd =
36505      std::next(MachineBasicBlock::iterator(LastCMOV));
36506 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36507
36508 // Now remove the CMOV(s).
36509 ThisMBB->erase(MIItBegin, MIItEnd);
36510
36511 return SinkMBB;
36512}
36513
36514static unsigned getSUBriOpcode(bool IsLP64) {
36515 if (IsLP64)
36516 return X86::SUB64ri32;
36517 else
36518 return X86::SUB32ri;
36519}
36520
36521MachineBasicBlock *
36522X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36523 MachineBasicBlock *MBB) const {
36524 MachineFunction *MF = MBB->getParent();
36525 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36526 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36527 const MIMetadata MIMD(MI);
36528 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36529
36530 const unsigned ProbeSize = getStackProbeSize(*MF);
36531
36532 MachineRegisterInfo &MRI = MF->getRegInfo();
36533 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36534 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36535 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36536
36537  MachineFunction::iterator MBBIter = ++MBB->getIterator();
36538  MF->insert(MBBIter, testMBB);
36539 MF->insert(MBBIter, blockMBB);
36540 MF->insert(MBBIter, tailMBB);
36541
36542 Register sizeVReg = MI.getOperand(1).getReg();
36543
36544 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36545
36546 Register TmpStackPtr = MRI.createVirtualRegister(
36547 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36548 Register FinalStackPtr = MRI.createVirtualRegister(
36549 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36550
36551 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36552 .addReg(physSPReg);
36553 {
36554 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36555 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36556 .addReg(TmpStackPtr)
36557 .addReg(sizeVReg);
36558 }
36559
36560 // test rsp size
36561
36562 BuildMI(testMBB, MIMD,
36563 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36564 .addReg(FinalStackPtr)
36565 .addReg(physSPReg);
36566
36567 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36568 .addMBB(tailMBB)
36569      .addImm(X86::COND_GE);
36570  testMBB->addSuccessor(blockMBB);
36571 testMBB->addSuccessor(tailMBB);
36572
36573  // Touch the block, then extend it. This is done in the opposite order from a
36574  // static probe, where we allocate and then touch, to avoid the need to probe
36575  // the tail of the static alloca. Possible scenarios are:
36576 //
36577 // + ---- <- ------------ <- ------------- <- ------------ +
36578 // | |
36579 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36580 // | |
36581 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36582 //
36583 // The property we want to enforce is to never have more than [page alloc] between two probes.
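  // For illustration (numbers are hypothetical): with ProbeSize = 4096 and a
  // 6000-byte dynamic allocation, blockMBB touches the current page and drops
  // the stack pointer by 4096 twice before the compare in testMBB transfers
  // control to tailMBB, where the pseudo's result is replaced by
  // FinalStackPtr.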
36584
36585 const unsigned XORMIOpc =
36586 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36587 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36588 .addImm(0);
36589
36590 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36591 physSPReg)
36592 .addReg(physSPReg)
36593 .addImm(ProbeSize);
36594
36595 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36596 blockMBB->addSuccessor(testMBB);
36597
36598 // Replace original instruction by the expected stack ptr
36599 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36600 MI.getOperand(0).getReg())
36601 .addReg(FinalStackPtr);
36602
36603 tailMBB->splice(tailMBB->end(), MBB,
36604 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36605  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36606  MBB->addSuccessor(testMBB);
36607
36608 // Delete the original pseudo instruction.
36609 MI.eraseFromParent();
36610
36611 // And we're done.
36612 return tailMBB;
36613}
36614
36615MachineBasicBlock *
36616X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36617 MachineBasicBlock *BB) const {
36618 MachineFunction *MF = BB->getParent();
36619 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36620 const MIMetadata MIMD(MI);
36621 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36622
36623 assert(MF->shouldSplitStack());
36624
36625 const bool Is64Bit = Subtarget.is64Bit();
36626 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36627
36628 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36629 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36630
36631 // BB:
36632 // ... [Till the alloca]
36633 // If stacklet is not large enough, jump to mallocMBB
36634 //
36635 // bumpMBB:
36636 // Allocate by subtracting from RSP
36637 // Jump to continueMBB
36638 //
36639 // mallocMBB:
36640 // Allocate by call to runtime
36641 //
36642 // continueMBB:
36643 // ...
36644 // [rest of original BB]
36645 //
36646
36647 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36648 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36649 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36650
36651 MachineRegisterInfo &MRI = MF->getRegInfo();
36652 const TargetRegisterClass *AddrRegClass =
36653      getRegClassFor(getPointerTy(MF->getDataLayout()));
36654
36655 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36656 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36657 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36658 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36659 sizeVReg = MI.getOperand(1).getReg(),
36660 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36661
36662 MachineFunction::iterator MBBIter = ++BB->getIterator();
36663
36664 MF->insert(MBBIter, bumpMBB);
36665 MF->insert(MBBIter, mallocMBB);
36666 MF->insert(MBBIter, continueMBB);
36667
36668 continueMBB->splice(continueMBB->begin(), BB,
36669 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36670 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36671
36672 // Add code to the main basic block to check if the stack limit has been hit,
36673 // and if so, jump to mallocMBB otherwise to bumpMBB.
36674 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36675 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36676 .addReg(tmpSPVReg).addReg(sizeVReg);
36677 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36678 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36679 .addReg(SPLimitVReg);
36680 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
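  // The CMP above computes [TlsReg:TlsOffset] - SPLimitVReg, i.e. it compares
  // the current stacklet's limit against the prospective new stack pointer;
  // the COND_G branch therefore takes the runtime (mallocMBB) path exactly
  // when the limit lies above the proposed stack pointer and the stacklet
  // would overflow.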
36681
36682 // bumpMBB simply decreases the stack pointer, since we know the current
36683 // stacklet has enough space.
36684 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36685 .addReg(SPLimitVReg);
36686 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36687 .addReg(SPLimitVReg);
36688 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36689
36690 // Calls into a routine in libgcc to allocate more space from the heap.
36691 const uint32_t *RegMask =
36692 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36693 if (IsLP64) {
36694 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36695 .addReg(sizeVReg);
36696 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36697 .addExternalSymbol("__morestack_allocate_stack_space")
36698 .addRegMask(RegMask)
36699 .addReg(X86::RDI, RegState::Implicit)
36700 .addReg(X86::RAX, RegState::ImplicitDefine);
36701 } else if (Is64Bit) {
36702 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36703 .addReg(sizeVReg);
36704 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36705 .addExternalSymbol("__morestack_allocate_stack_space")
36706 .addRegMask(RegMask)
36707 .addReg(X86::EDI, RegState::Implicit)
36708 .addReg(X86::EAX, RegState::ImplicitDefine);
36709 } else {
36710 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36711 .addImm(12);
36712 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36713 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36714 .addExternalSymbol("__morestack_allocate_stack_space")
36715 .addRegMask(RegMask)
36716 .addReg(X86::EAX, RegState::ImplicitDefine);
36717 }
36718
36719 if (!Is64Bit)
36720 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36721 .addImm(16);
36722
36723 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36724 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36725 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36726
36727 // Set up the CFG correctly.
36728 BB->addSuccessor(bumpMBB);
36729 BB->addSuccessor(mallocMBB);
36730 mallocMBB->addSuccessor(continueMBB);
36731 bumpMBB->addSuccessor(continueMBB);
36732
36733 // Take care of the PHI nodes.
36734 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36735 MI.getOperand(0).getReg())
36736 .addReg(mallocPtrVReg)
36737 .addMBB(mallocMBB)
36738 .addReg(bumpSPPtrVReg)
36739 .addMBB(bumpMBB);
36740
36741 // Delete the original pseudo instruction.
36742 MI.eraseFromParent();
36743
36744 // And we're done.
36745 return continueMBB;
36746}
36747
36748MachineBasicBlock *
36749X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36750 MachineBasicBlock *BB) const {
36751 MachineFunction *MF = BB->getParent();
36752 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36753 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36754 const MIMetadata MIMD(MI);
36755
36756  assert(!isAsynchronousEHPersonality(
36757             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36758         "SEH does not use catchret!");
36759
36760 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36761 if (!Subtarget.is32Bit())
36762 return BB;
36763
36764 // C++ EH creates a new target block to hold the restore code, and wires up
36765 // the new block to the return destination with a normal JMP_4.
36766 MachineBasicBlock *RestoreMBB =
36767      MF->CreateMachineBasicBlock(BB->getBasicBlock());
36768  assert(BB->succ_size() == 1);
36769 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36770 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36771 BB->addSuccessor(RestoreMBB);
36772 MI.getOperand(0).setMBB(RestoreMBB);
36773
36774 // Marking this as an EH pad but not a funclet entry block causes PEI to
36775 // restore stack pointers in the block.
36776 RestoreMBB->setIsEHPad(true);
36777
36778 auto RestoreMBBI = RestoreMBB->begin();
36779 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36780 return BB;
36781}
36782
36783MachineBasicBlock *
36784X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36785 MachineBasicBlock *BB) const {
36786 // This is pretty easy. We're taking the value that we received from
36787 // our load from the relocation, sticking it in either RDI (x86-64)
36788 // or EAX and doing an indirect call. The return value will then
36789 // be in the normal return register.
36790 MachineFunction *F = BB->getParent();
36791 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36792 const MIMetadata MIMD(MI);
36793
36794 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36795 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36796
36797 // Get a register mask for the lowered call.
36798 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36799 // proper register mask.
36800 const uint32_t *RegMask =
36801 Subtarget.is64Bit() ?
36802 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36803 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36804 if (Subtarget.is64Bit()) {
36805 MachineInstrBuilder MIB =
36806 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36807 .addReg(X86::RIP)
36808 .addImm(0)
36809 .addReg(0)
36810 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36811 MI.getOperand(3).getTargetFlags())
36812 .addReg(0);
36813 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36814 addDirectMem(MIB, X86::RDI);
36815 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36816 } else if (!isPositionIndependent()) {
36817 MachineInstrBuilder MIB =
36818 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36819 .addReg(0)
36820 .addImm(0)
36821 .addReg(0)
36822 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36823 MI.getOperand(3).getTargetFlags())
36824 .addReg(0);
36825 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36826 addDirectMem(MIB, X86::EAX);
36827 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36828 } else {
36829 MachineInstrBuilder MIB =
36830 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36831 .addReg(TII->getGlobalBaseReg(F))
36832 .addImm(0)
36833 .addReg(0)
36834 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36835 MI.getOperand(3).getTargetFlags())
36836 .addReg(0);
36837 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36838 addDirectMem(MIB, X86::EAX);
36839 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36840 }
36841
36842 MI.eraseFromParent(); // The pseudo instruction is gone now.
36843 return BB;
36844}
36845
36846static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36847 switch (RPOpc) {
36848 case X86::INDIRECT_THUNK_CALL32:
36849 return X86::CALLpcrel32;
36850 case X86::INDIRECT_THUNK_CALL64:
36851 return X86::CALL64pcrel32;
36852 case X86::INDIRECT_THUNK_TCRETURN32:
36853 return X86::TCRETURNdi;
36854 case X86::INDIRECT_THUNK_TCRETURN64:
36855 return X86::TCRETURNdi64;
36856 }
36857 llvm_unreachable("not indirect thunk opcode");
36858}
36859
36860static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36861 Register Reg) {
36862 if (Subtarget.useRetpolineExternalThunk()) {
36863 // When using an external thunk for retpolines, we pick names that match the
36864 // names GCC happens to use as well. This helps simplify the implementation
36865 // of the thunks for kernels where they have no easy ability to create
36866 // aliases and are doing non-trivial configuration of the thunk's body. For
36867 // example, the Linux kernel will do boot-time hot patching of the thunk
36868 // bodies and cannot easily export aliases of these to loaded modules.
36869 //
36870 // Note that at any point in the future, we may need to change the semantics
36871 // of how we implement retpolines and at that time will likely change the
36872 // name of the called thunk. Essentially, there is no hard guarantee that
36873 // LLVM will generate calls to specific thunks, we merely make a best-effort
36874 // attempt to help out kernels and other systems where duplicating the
36875 // thunks is costly.
36876 switch (Reg.id()) {
36877 case X86::EAX:
36878 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36879 return "__x86_indirect_thunk_eax";
36880 case X86::ECX:
36881 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36882 return "__x86_indirect_thunk_ecx";
36883 case X86::EDX:
36884 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36885 return "__x86_indirect_thunk_edx";
36886 case X86::EDI:
36887 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36888 return "__x86_indirect_thunk_edi";
36889 case X86::R11:
36890 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36891 return "__x86_indirect_thunk_r11";
36892 }
36893 llvm_unreachable("unexpected reg for external indirect thunk");
36894 }
36895
36896 if (Subtarget.useRetpolineIndirectCalls() ||
36897 Subtarget.useRetpolineIndirectBranches()) {
36898 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36899 switch (Reg.id()) {
36900 case X86::EAX:
36901 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36902 return "__llvm_retpoline_eax";
36903 case X86::ECX:
36904 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36905 return "__llvm_retpoline_ecx";
36906 case X86::EDX:
36907 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36908 return "__llvm_retpoline_edx";
36909 case X86::EDI:
36910 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36911 return "__llvm_retpoline_edi";
36912 case X86::R11:
36913 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36914 return "__llvm_retpoline_r11";
36915 }
36916 llvm_unreachable("unexpected reg for retpoline");
36917 }
36918
36919 if (Subtarget.useLVIControlFlowIntegrity()) {
36920 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36921 return "__llvm_lvi_thunk_r11";
36922 }
36923 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36924}
36925
36926MachineBasicBlock *
36927X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36928 MachineBasicBlock *BB) const {
36929 // Copy the virtual register into the R11 physical register and
36930 // call the retpoline thunk.
36931 const MIMetadata MIMD(MI);
36932 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36933 Register CalleeVReg = MI.getOperand(0).getReg();
36934 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36935
36936 // Find an available scratch register to hold the callee. On 64-bit, we can
36937 // just use R11, but we scan for uses anyway to ensure we don't generate
36938 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36939 // already a register use operand to the call to hold the callee. If none
36940 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36941 // register and ESI is the base pointer to realigned stack frames with VLAs.
36942 SmallVector<Register, 3> AvailableRegs;
36943 if (Subtarget.is64Bit())
36944 AvailableRegs.push_back(X86::R11);
36945 else
36946 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36947
36948 // Zero out any registers that are already used.
36949 for (const auto &MO : MI.operands()) {
36950 if (MO.isReg() && MO.isUse())
36951 llvm::replace(AvailableRegs, MO.getReg(), Register());
36952 }
36953
36954 // Choose the first remaining non-zero available register.
36955 Register AvailableReg;
36956 for (Register MaybeReg : AvailableRegs) {
36957 if (MaybeReg) {
36958 AvailableReg = MaybeReg;
36959 break;
36960 }
36961 }
36962 if (!AvailableReg)
36963 report_fatal_error("calling convention incompatible with retpoline, no "
36964 "available registers");
36965
36966 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36967
36968 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36969 .addReg(CalleeVReg);
36970 MI.getOperand(0).ChangeToES(Symbol);
36971 MI.setDesc(TII->get(Opc));
36972 MachineInstrBuilder(*BB->getParent(), &MI)
36973 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36974 return BB;
36975}
36976
36977/// SetJmp implies future control flow change upon calling the corresponding
36978/// LongJmp.
36979/// Instead of using the 'return' instruction, the long jump fixes the stack and
36980/// performs an indirect branch. To do so it uses the registers that were stored
36981/// in the jump buffer (when calling SetJmp).
36982/// In case the shadow stack is enabled we need to fix it as well, because some
36983/// return addresses will be skipped.
36984/// The function will save the SSP for future fixing in the function
36985/// emitLongJmpShadowStackFix.
36986/// \sa emitLongJmpShadowStackFix
36987/// \param [in] MI The temporary Machine Instruction for the builtin.
36988/// \param [in] MBB The Machine Basic Block that will be modified.
36989void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36990 MachineBasicBlock *MBB) const {
36991 const MIMetadata MIMD(MI);
36992 MachineFunction *MF = MBB->getParent();
36993 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36994 MachineRegisterInfo &MRI = MF->getRegInfo();
36995 MachineInstrBuilder MIB;
36996
36997 // Memory Reference.
36998 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36999
37000 // Initialize a register with zero.
37001 MVT PVT = getPointerTy(MF->getDataLayout());
37002 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37003 Register ZReg = MRI.createVirtualRegister(PtrRC);
37004 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37005 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
37006 .addDef(ZReg)
37007 .addReg(ZReg, RegState::Undef)
37008 .addReg(ZReg, RegState::Undef);
37009
37010 // Read the current SSP Register value to the zeroed register.
37011 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37012 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37013 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37014
37015 // Write the SSP register value to offset 3 in input memory buffer.
37016 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37017 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37018 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37019 const unsigned MemOpndSlot = 1;
37020 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37021 if (i == X86::AddrDisp)
37022 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37023 else
37024 MIB.add(MI.getOperand(MemOpndSlot + i));
37025 }
37026 MIB.addReg(SSPCopyReg);
37027 MIB.setMemRefs(MMOs);
37028}
37029
37030MachineBasicBlock *
37031X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37032 MachineBasicBlock *MBB) const {
37033 const MIMetadata MIMD(MI);
37034 MachineFunction *MF = MBB->getParent();
37035 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37036 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37037 MachineRegisterInfo &MRI = MF->getRegInfo();
37038
37039 const BasicBlock *BB = MBB->getBasicBlock();
37040  MachineFunction::iterator I = ++MBB->getIterator();
37041
37042 // Memory Reference
37043 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37044
37045 unsigned MemOpndSlot = 0;
37046
37047 unsigned CurOp = 0;
37048
37049 Register DstReg = MI.getOperand(CurOp++).getReg();
37050 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37051 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37052 (void)TRI;
37053 Register mainDstReg = MRI.createVirtualRegister(RC);
37054 Register restoreDstReg = MRI.createVirtualRegister(RC);
37055
37056 MemOpndSlot = CurOp;
37057
37058 MVT PVT = getPointerTy(MF->getDataLayout());
37059 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37060 "Invalid Pointer Size!");
37061
37062 // For v = setjmp(buf), we generate
37063 //
37064 // thisMBB:
37065 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37066 // SjLjSetup restoreMBB
37067 //
37068 // mainMBB:
37069 // v_main = 0
37070 //
37071 // sinkMBB:
37072 // v = phi(main, restore)
37073 //
37074 // restoreMBB:
37075 // if base pointer being used, load it from frame
37076 // v_restore = 1
37077
37078 MachineBasicBlock *thisMBB = MBB;
37079 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37080 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37081 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37082 MF->insert(I, mainMBB);
37083 MF->insert(I, sinkMBB);
37084 MF->push_back(restoreMBB);
37085 restoreMBB->setMachineBlockAddressTaken();
37086
37087 MachineInstrBuilder MIB;
37088
37089 // Transfer the remainder of BB and its successor edges to sinkMBB.
37090 sinkMBB->splice(sinkMBB->begin(), MBB,
37091 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37092  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37093
37094 // thisMBB:
37095 unsigned PtrStoreOpc = 0;
37096 Register LabelReg;
37097 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37098 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37099                     !isPositionIndependent();
37100
37101 // Prepare IP either in reg or imm.
37102 if (!UseImmLabel) {
37103 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37104 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37105 LabelReg = MRI.createVirtualRegister(PtrRC);
37106 if (Subtarget.is64Bit()) {
37107 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37108 .addReg(X86::RIP)
37109 .addImm(0)
37110 .addReg(0)
37111 .addMBB(restoreMBB)
37112 .addReg(0);
37113 } else {
37114 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37115 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37116 .addReg(XII->getGlobalBaseReg(MF))
37117 .addImm(0)
37118 .addReg(0)
37119 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37120 .addReg(0);
37121 }
37122 } else
37123 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37124 // Store IP
37125 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37126 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37127 if (i == X86::AddrDisp)
37128 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37129 else
37130 MIB.add(MI.getOperand(MemOpndSlot + i));
37131 }
37132 if (!UseImmLabel)
37133 MIB.addReg(LabelReg);
37134 else
37135 MIB.addMBB(restoreMBB);
37136 MIB.setMemRefs(MMOs);
37137
37138 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37139 emitSetJmpShadowStackFix(MI, thisMBB);
37140 }
37141
37142 // Setup
37143 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37144 .addMBB(restoreMBB);
37145
37146 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37147 MIB.addRegMask(RegInfo->getNoPreservedMask());
37148 thisMBB->addSuccessor(mainMBB);
37149 thisMBB->addSuccessor(restoreMBB);
37150
37151 // mainMBB:
37152 // EAX = 0
37153 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37154 mainMBB->addSuccessor(sinkMBB);
37155
37156 // sinkMBB:
37157 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37158 .addReg(mainDstReg)
37159 .addMBB(mainMBB)
37160 .addReg(restoreDstReg)
37161 .addMBB(restoreMBB);
37162
37163 // restoreMBB:
37164 if (RegInfo->hasBasePointer(*MF)) {
37165 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37166 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37167 X86FI->setRestoreBasePointer(MF);
37168 Register FramePtr = RegInfo->getFrameRegister(*MF);
37169 Register BasePtr = RegInfo->getBaseRegister();
37170 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37171 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37172 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37173        .setMIFlag(MachineInstr::FrameSetup);
37174  }
37175 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37176 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37177 restoreMBB->addSuccessor(sinkMBB);
37178
37179 MI.eraseFromParent();
37180 return sinkMBB;
37181}
37182
37183/// Fix the shadow stack using the previously saved SSP pointer.
37184/// \sa emitSetJmpShadowStackFix
37185/// \param [in] MI The temporary Machine Instruction for the builtin.
37186/// \param [in] MBB The Machine Basic Block that will be modified.
37187/// \return The sink MBB that will perform the future indirect branch.
37188MachineBasicBlock *
37189X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37190 MachineBasicBlock *MBB) const {
37191 const MIMetadata MIMD(MI);
37192 MachineFunction *MF = MBB->getParent();
37193 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37194 MachineRegisterInfo &MRI = MF->getRegInfo();
37195
37196 // Memory Reference
37197 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37198
37199 MVT PVT = getPointerTy(MF->getDataLayout());
37200 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37201
37202 // checkSspMBB:
37203 // xor vreg1, vreg1
37204 // rdssp vreg1
37205 // test vreg1, vreg1
37206 // je sinkMBB # Jump if Shadow Stack is not supported
37207 // fallMBB:
37208 // mov buf+24/12(%rip), vreg2
37209 // sub vreg1, vreg2
37210 // jbe sinkMBB # No need to fix the Shadow Stack
37211 // fixShadowMBB:
37212 // shr 3/2, vreg2
37213 // incssp vreg2 # fix the SSP according to the lower 8 bits
37214 // shr 8, vreg2
37215 // je sinkMBB
37216 // fixShadowLoopPrepareMBB:
37217 // shl vreg2
37218 // mov 128, vreg3
37219 // fixShadowLoopMBB:
37220 // incssp vreg3
37221 // dec vreg2
37222 // jne fixShadowLoopMBB # Iterate until you finish fixing
37223 // # the Shadow Stack
37224 // sinkMBB:
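  // For illustration (hypothetical values): if the saved SSP is 0x1210 bytes
  // above the current one on x86-64, the first shift leaves 0x242 shadow
  // stack entries to advance. The first INCSSP consumes the low 8 bits
  // (0x42 entries), the remaining 0x2 is doubled to 4 by the shift left, and
  // the loop then performs 4 INCSSP-by-128 steps to cover the other 0x200
  // entries.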
37225
37227 const BasicBlock *BB = MBB->getBasicBlock();
37228
37229 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37230 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37231 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37232 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37233 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37234 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37235 MF->insert(I, checkSspMBB);
37236 MF->insert(I, fallMBB);
37237 MF->insert(I, fixShadowMBB);
37238 MF->insert(I, fixShadowLoopPrepareMBB);
37239 MF->insert(I, fixShadowLoopMBB);
37240 MF->insert(I, sinkMBB);
37241
37242 // Transfer the remainder of BB and its successor edges to sinkMBB.
37243 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37244 MBB->end());
37245  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37246
37247 MBB->addSuccessor(checkSspMBB);
37248
37249 // Initialize a register with zero.
37250 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37251 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37252
37253 if (PVT == MVT::i64) {
37254 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37255 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37256 .addImm(0)
37257 .addReg(ZReg)
37258 .addImm(X86::sub_32bit);
37259 ZReg = TmpZReg;
37260 }
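  // The SUBREG_TO_REG with a 0 immediate models the implicit zero-extension
  // that 32-bit operations already perform on x86-64, so the 32-bit zero is
  // widened to a 64-bit value without emitting any additional instruction.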
37261
37262 // Read the current SSP Register value to the zeroed register.
37263 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37264 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37265 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37266
37267 // Check whether the result of the SSP register is zero and jump directly
37268 // to the sink.
37269 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37270 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37271 .addReg(SSPCopyReg)
37272 .addReg(SSPCopyReg);
37273 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37274 .addMBB(sinkMBB)
37275      .addImm(X86::COND_E);
37276  checkSspMBB->addSuccessor(sinkMBB);
37277 checkSspMBB->addSuccessor(fallMBB);
37278
37279 // Reload the previously saved SSP register value.
37280 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37281 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37282 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37283 MachineInstrBuilder MIB =
37284 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37285 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37286 const MachineOperand &MO = MI.getOperand(i);
37287 if (i == X86::AddrDisp)
37288 MIB.addDisp(MO, SPPOffset);
37289 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37290 // preserve kill flags.
37291 MIB.addReg(MO.getReg());
37292 else
37293 MIB.add(MO);
37294 }
37295 MIB.setMemRefs(MMOs);
37296
37297 // Subtract the current SSP from the previous SSP.
37298 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37299 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37300 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37301 .addReg(PrevSSPReg)
37302 .addReg(SSPCopyReg);
37303
37304 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37305 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37306 .addMBB(sinkMBB)
37307      .addImm(X86::COND_BE);
37308  fallMBB->addSuccessor(sinkMBB);
37309 fallMBB->addSuccessor(fixShadowMBB);
37310
37311 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37312 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37313 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37314 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37315 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37316 .addReg(SspSubReg)
37317 .addImm(Offset);
37318
37319  // Increase the SSP using only the lower 8 bits of the delta.
37320 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37321 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37322
37323 // Reset the lower 8 bits.
37324 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37325 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37326 .addReg(SspFirstShrReg)
37327 .addImm(8);
37328
37329 // Jump if the result of the shift is zero.
37330 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37331 .addMBB(sinkMBB)
37332      .addImm(X86::COND_E);
37333  fixShadowMBB->addSuccessor(sinkMBB);
37334 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37335
37336 // Do a single shift left.
37337 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37338 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37339 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37340 .addReg(SspSecondShrReg)
37341 .addImm(1);
37342
37343 // Save the value 128 to a register (will be used next with incssp).
37344 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37345 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37346 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37347 .addImm(128);
37348 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37349
37350 // Since incssp only looks at the lower 8 bits, we might need to do several
37351 // iterations of incssp until we finish fixing the shadow stack.
37352 Register DecReg = MRI.createVirtualRegister(PtrRC);
37353 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37354 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37355 .addReg(SspAfterShlReg)
37356 .addMBB(fixShadowLoopPrepareMBB)
37357 .addReg(DecReg)
37358 .addMBB(fixShadowLoopMBB);
37359
37360 // Every iteration we increase the SSP by 128.
37361 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37362
37363 // Every iteration we decrement the counter by 1.
37364 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37365 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37366
37367 // Jump if the counter is not zero yet.
37368 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37369 .addMBB(fixShadowLoopMBB)
37370      .addImm(X86::COND_NE);
37371  fixShadowLoopMBB->addSuccessor(sinkMBB);
37372 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37373
37374 return sinkMBB;
37375}
37376
37377MachineBasicBlock *
37378X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37379 MachineBasicBlock *MBB) const {
37380 const MIMetadata MIMD(MI);
37381 MachineFunction *MF = MBB->getParent();
37382 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37383 MachineRegisterInfo &MRI = MF->getRegInfo();
37384
37385 // Memory Reference
37386 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37387
37388 MVT PVT = getPointerTy(MF->getDataLayout());
37389 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37390 "Invalid Pointer Size!");
37391
37392 const TargetRegisterClass *RC =
37393 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37394 Register Tmp = MRI.createVirtualRegister(RC);
37395 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37396 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37397 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37398 Register SP = RegInfo->getStackRegister();
37399
37400 MachineInstrBuilder MIB;
37401
37402 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37403 const int64_t SPOffset = 2 * PVT.getStoreSize();
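  // Together with emitSetJmpShadowStackFix, these offsets describe the jump
  // buffer layout in pointer-sized slots: [0] frame pointer, [1] resume label
  // (LabelOffset), [2] stack pointer (SPOffset), [3] shadow stack pointer.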
37404
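// Layout of the setjmp buffer assumed by the reloads below: slot 0 holds the
// frame pointer, slot 1 (LabelOffset) the resume address, and slot 2
// (SPOffset) the stack pointer.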
37405 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37406 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37407
37408 MachineBasicBlock *thisMBB = MBB;
37409
37410 // When CET and shadow stacks are enabled, we need to fix the Shadow Stack.
37411 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37412 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37413 }
37414
37415 // Reload FP
37416 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37417 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37418 const MachineOperand &MO = MI.getOperand(i);
37419 if (MO.isReg()) // Don't add the whole operand, we don't want to
37420 // preserve kill flags.
37421 MIB.addReg(MO.getReg());
37422 else
37423 MIB.add(MO);
37424 }
37425 MIB.setMemRefs(MMOs);
37427
37428 // Reload IP
37429 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37430 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37431 const MachineOperand &MO = MI.getOperand(i);
37432 if (i == X86::AddrDisp)
37433 MIB.addDisp(MO, LabelOffset);
37434 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37435 // preserve kill flags.
37436 MIB.addReg(MO.getReg());
37437 else
37438 MIB.add(MO);
37439 }
37440 MIB.setMemRefs(MMOs);
37441
37442 // Reload SP
37443 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37444 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37445 if (i == X86::AddrDisp)
37446 MIB.addDisp(MI.getOperand(i), SPOffset);
37447 else
37448 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37449 // the last instruction of the expansion.
37450 }
37451 MIB.setMemRefs(MMOs);
37453
37454 // Jump
37455 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37456
37457 MI.eraseFromParent();
37458 return thisMBB;
37459}
37460
37461void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37462 MachineBasicBlock *MBB,
37463 MachineBasicBlock *DispatchBB,
37464 int FI) const {
37465 const MIMetadata MIMD(MI);
37466 MachineFunction *MF = MBB->getParent();
37467 MachineRegisterInfo *MRI = &MF->getRegInfo();
37468 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37469
37470 MVT PVT = getPointerTy(MF->getDataLayout());
37471 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37472
37473 unsigned Op = 0;
37474 Register VR;
37475
37476 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37477 !isPositionIndependent();
37478
37479 if (UseImmLabel) {
37480 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37481 } else {
37482 const TargetRegisterClass *TRC =
37483 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37484 VR = MRI->createVirtualRegister(TRC);
37485 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37486
37487 if (Subtarget.is64Bit())
37488 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37489 .addReg(X86::RIP)
37490 .addImm(1)
37491 .addReg(0)
37492 .addMBB(DispatchBB)
37493 .addReg(0);
37494 else
37495 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37496 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37497 .addImm(1)
37498 .addReg(0)
37499 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37500 .addReg(0);
37501 }
37502
37503 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
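// Store the dispatch block's address into the SjLj function context at a
// fixed, pointer-size-dependent offset (56 bytes on 64-bit, 36 on 32-bit);
// this appears to be the slot the SjLj unwinder resumes through.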
37504 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37505 if (UseImmLabel)
37506 MIB.addMBB(DispatchBB);
37507 else
37508 MIB.addReg(VR);
37509}
37510
37511 MachineBasicBlock *
37512X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37513 MachineBasicBlock *BB) const {
37514 const MIMetadata MIMD(MI);
37515 MachineFunction *MF = BB->getParent();
37516 MachineRegisterInfo *MRI = &MF->getRegInfo();
37517 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37518 int FI = MF->getFrameInfo().getFunctionContextIndex();
37519
37520 // Get a mapping of the call site numbers to all of the landing pads they're
37521 // associated with.
37522 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37523 unsigned MaxCSNum = 0;
37524 for (auto &MBB : *MF) {
37525 if (!MBB.isEHPad())
37526 continue;
37527
37528 MCSymbol *Sym = nullptr;
37529 for (const auto &MI : MBB) {
37530 if (MI.isDebugInstr())
37531 continue;
37532
37533 assert(MI.isEHLabel() && "expected EH_LABEL");
37534 Sym = MI.getOperand(0).getMCSymbol();
37535 break;
37536 }
37537
37538 if (!MF->hasCallSiteLandingPad(Sym))
37539 continue;
37540
37541 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37542 CallSiteNumToLPad[CSI].push_back(&MBB);
37543 MaxCSNum = std::max(MaxCSNum, CSI);
37544 }
37545 }
37546
37547 // Get an ordered list of the machine basic blocks for the jump table.
37548 std::vector<MachineBasicBlock *> LPadList;
37549 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37550 LPadList.reserve(CallSiteNumToLPad.size());
37551
37552 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37553 for (auto &LP : CallSiteNumToLPad[CSI]) {
37554 LPadList.push_back(LP);
37555 InvokeBBs.insert_range(LP->predecessors());
37556 }
37557 }
37558
37559 assert(!LPadList.empty() &&
37560 "No landing pad destinations for the dispatch jump table!");
37561
37562 // Create the MBBs for the dispatch code.
37563
37564 // Shove the dispatch's address into the return slot in the function context.
37565 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37566 DispatchBB->setIsEHPad(true);
37567
37568 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37569 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37570 DispatchBB->addSuccessor(TrapBB);
37571
37572 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37573 DispatchBB->addSuccessor(DispContBB);
37574
37575 // Insert MBBs.
37576 MF->push_back(DispatchBB);
37577 MF->push_back(DispContBB);
37578 MF->push_back(TrapBB);
37579
37580 // Insert code into the entry block that creates and registers the function
37581 // context.
37582 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37583
37584 // Create the jump table and associated information
37585 unsigned JTE = getJumpTableEncoding();
37586 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37587 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37588
37589 const X86RegisterInfo &RI = TII->getRegisterInfo();
37590 // Add a register mask with no preserved registers. This results in all
37591 // registers being marked as clobbered.
37592 if (RI.hasBasePointer(*MF)) {
37593 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37594 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37595 MFI->setRestoreBasePointer(MF);
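// The dispatch block is reached from the unwinder, so a base pointer (if the
// function uses one) is not valid on entry; reload it from the slot recorded
// by setRestoreBasePointer before anything relies on it.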
37596
37597 Register FP = RI.getFrameRegister(*MF);
37598 Register BP = RI.getBaseRegister();
37599 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37600 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37601 MFI->getRestoreBasePointerOffset())
37602 .addRegMask(RI.getNoPreservedMask());
37603 } else {
37604 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37605 .addRegMask(RI.getNoPreservedMask());
37606 }
37607
37608 // IReg is used as an index in a memory operand and therefore can't be SP
37609 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37610 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37611 Subtarget.is64Bit() ? 8 : 4);
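// The call-site index loaded above selects a jump-table entry; any value at
// or beyond the number of landing pads has no destination, so the unsigned
// compare below branches to the trap block.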
37612 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37613 .addReg(IReg)
37614 .addImm(LPadList.size());
37615 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37616 .addMBB(TrapBB)
37617 .addImm(X86::COND_AE);
37618
37619 if (Subtarget.is64Bit()) {
37620 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37621 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37622
37623 // leaq .LJTI0_0(%rip), BReg
37624 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37625 .addReg(X86::RIP)
37626 .addImm(1)
37627 .addReg(0)
37628 .addJumpTableIndex(MJTI)
37629 .addReg(0);
37630 // movzx IReg64, IReg
37631 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37632 .addImm(0)
37633 .addReg(IReg)
37634 .addImm(X86::sub_32bit);
37635
37636 switch (JTE) {
37637 case MachineJumpTableInfo::EK_BlockAddress:
37638 // jmpq *(BReg,IReg64,8)
37639 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37640 .addReg(BReg)
37641 .addImm(8)
37642 .addReg(IReg64)
37643 .addImm(0)
37644 .addReg(0);
37645 break;
37646 case MachineJumpTableInfo::EK_LabelDifference32: {
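// Label-difference entries are 32-bit offsets relative to the jump table
// itself, so the entry is loaded, sign-extended and added to the table base
// before the indirect jump.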
37647 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37648 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37649 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37650
37651 // movl (BReg,IReg64,4), OReg
37652 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37653 .addReg(BReg)
37654 .addImm(4)
37655 .addReg(IReg64)
37656 .addImm(0)
37657 .addReg(0);
37658 // movsx OReg64, OReg
37659 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37660 .addReg(OReg);
37661 // addq BReg, OReg64, TReg
37662 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37663 .addReg(OReg64)
37664 .addReg(BReg);
37665 // jmpq *TReg
37666 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37667 break;
37668 }
37669 default:
37670 llvm_unreachable("Unexpected jump table encoding");
37671 }
37672 } else {
37673 // jmpl *.LJTI0_0(,IReg,4)
37674 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37675 .addReg(0)
37676 .addImm(4)
37677 .addReg(IReg)
37678 .addJumpTableIndex(MJTI)
37679 .addReg(0);
37680 }
37681
37682 // Add the jump table entries as successors to the MBB.
37683 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37684 for (auto &LP : LPadList)
37685 if (SeenMBBs.insert(LP).second)
37686 DispContBB->addSuccessor(LP);
37687
37688 // N.B. the order the invoke BBs are processed in doesn't matter here.
37689 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37690 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37691 for (MachineBasicBlock *MBB : InvokeBBs) {
37692 // Remove the landing pad successor from the invoke block and replace it
37693 // with the new dispatch block.
37694 // Keep a copy of Successors since it's modified inside the loop.
37695 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37696 MBB->succ_rend());
37697 // FIXME: Avoid quadratic complexity.
37698 for (auto *MBBS : Successors) {
37699 if (MBBS->isEHPad()) {
37700 MBB->removeSuccessor(MBBS);
37701 MBBLPads.push_back(MBBS);
37702 }
37703 }
37704
37705 MBB->addSuccessor(DispatchBB);
37706
37707 // Find the invoke call and mark all of the callee-saved registers as
37708 // 'implicit defined' so that they're spilled. This prevents later passes
37709 // from moving instructions to before the EH block, where they would never
37710 // be executed.
37711 for (auto &II : reverse(*MBB)) {
37712 if (!II.isCall())
37713 continue;
37714
37715 DenseSet<Register> DefRegs;
37716 for (auto &MOp : II.operands())
37717 if (MOp.isReg())
37718 DefRegs.insert(MOp.getReg());
37719
37720 MachineInstrBuilder MIB(*MF, &II);
37721 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37722 Register Reg = SavedRegs[RegIdx];
37723 if (!DefRegs.contains(Reg))
37724 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37725 }
37726
37727 break;
37728 }
37729 }
37730
37731 // Mark all former landing pads as non-landing pads. The dispatch is the only
37732 // landing pad now.
37733 for (auto &LP : MBBLPads)
37734 LP->setIsEHPad(false);
37735
37736 // The instruction is gone now.
37737 MI.eraseFromParent();
37738 return BB;
37739}
37740
37741 MachineBasicBlock *
37742X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37743 MachineBasicBlock *BB) const {
37744 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37745 // calls may require proper stack alignment.
37746 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37747 const MIMetadata MIMD(MI);
37748 MachineFunction &MF = *BB->getParent();
37749
37750 // Emit CALLSEQ_START right before the instruction.
37751 MF.getFrameInfo().setAdjustsStack(true);
37752 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37753 MachineInstrBuilder CallseqStart =
37754 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37755 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37756
37757 // Emit CALLSEQ_END right after the instruction.
37758 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37759 MachineInstrBuilder CallseqEnd =
37760 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37761 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37762
37763 return BB;
37764}
37765
37766 MachineBasicBlock *
37767 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37768 MachineBasicBlock *BB) const {
37769 MachineFunction *MF = BB->getParent();
37770 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37771 const MIMetadata MIMD(MI);
37772
37773 auto TMMImmToTMMReg = [](unsigned Imm) {
37774 assert (Imm < 8 && "Illegal tmm index");
37775 return X86::TMM0 + Imm;
37776 };
37777 auto TMMImmToTMMPair = [](unsigned Imm) {
37778 assert(Imm < 8 && "Illegal tmm pair index.");
37779 return X86::TMM0_TMM1 + Imm / 2;
37780 };
37781 switch (MI.getOpcode()) {
37782 default:
37783 llvm_unreachable("Unexpected instr type to insert");
37784 case X86::INDIRECT_THUNK_CALL32:
37785 case X86::INDIRECT_THUNK_CALL64:
37786 case X86::INDIRECT_THUNK_TCRETURN32:
37787 case X86::INDIRECT_THUNK_TCRETURN64:
37788 return EmitLoweredIndirectThunk(MI, BB);
37789 case X86::CATCHRET:
37790 return EmitLoweredCatchRet(MI, BB);
37791 case X86::SEG_ALLOCA_32:
37792 case X86::SEG_ALLOCA_64:
37793 return EmitLoweredSegAlloca(MI, BB);
37794 case X86::PROBED_ALLOCA_32:
37795 case X86::PROBED_ALLOCA_64:
37796 return EmitLoweredProbedAlloca(MI, BB);
37797 case X86::TLSCall_32:
37798 case X86::TLSCall_64:
37799 return EmitLoweredTLSCall(MI, BB);
37800 case X86::CMOV_FR16:
37801 case X86::CMOV_FR16X:
37802 case X86::CMOV_FR32:
37803 case X86::CMOV_FR32X:
37804 case X86::CMOV_FR64:
37805 case X86::CMOV_FR64X:
37806 case X86::CMOV_GR8:
37807 case X86::CMOV_GR16:
37808 case X86::CMOV_GR32:
37809 case X86::CMOV_RFP32:
37810 case X86::CMOV_RFP64:
37811 case X86::CMOV_RFP80:
37812 case X86::CMOV_VR64:
37813 case X86::CMOV_VR128:
37814 case X86::CMOV_VR128X:
37815 case X86::CMOV_VR256:
37816 case X86::CMOV_VR256X:
37817 case X86::CMOV_VR512:
37818 case X86::CMOV_VK1:
37819 case X86::CMOV_VK2:
37820 case X86::CMOV_VK4:
37821 case X86::CMOV_VK8:
37822 case X86::CMOV_VK16:
37823 case X86::CMOV_VK32:
37824 case X86::CMOV_VK64:
37825 return EmitLoweredSelect(MI, BB);
37826
37827 case X86::FP80_ADDr:
37828 case X86::FP80_ADDm32: {
37829 // Change the floating point control register to use double extended
37830 // precision when performing the addition.
37831 int OrigCWFrameIdx =
37832 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37833 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37834 OrigCWFrameIdx);
37835
37836 // Load the old value of the control word...
37837 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37838 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37839 OrigCWFrameIdx);
37840
37841 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37842 // precision.
37843 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37844 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37845 .addReg(OldCW, RegState::Kill)
37846 .addImm(0x300);
37847
37848 // Extract to 16 bits.
37849 Register NewCW16 =
37850 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37851 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37852 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37853
37854 // Prepare memory for FLDCW.
37855 int NewCWFrameIdx =
37856 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37857 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37858 NewCWFrameIdx)
37859 .addReg(NewCW16, RegState::Kill);
37860
37861 // Reload the modified control word now...
37862 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37863 NewCWFrameIdx);
37864
37865 // Do the addition.
37866 if (MI.getOpcode() == X86::FP80_ADDr) {
37867 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37868 .add(MI.getOperand(0))
37869 .add(MI.getOperand(1))
37870 .add(MI.getOperand(2));
37871 } else {
37872 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37873 .add(MI.getOperand(0))
37874 .add(MI.getOperand(1))
37875 .add(MI.getOperand(2))
37876 .add(MI.getOperand(3))
37877 .add(MI.getOperand(4))
37878 .add(MI.getOperand(5))
37879 .add(MI.getOperand(6));
37880 }
37881
37882 // Reload the original control word now.
37883 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37884 OrigCWFrameIdx);
37885
37886 MI.eraseFromParent(); // The pseudo instruction is gone now.
37887 return BB;
37888 }
37889
37890 case X86::FP32_TO_INT16_IN_MEM:
37891 case X86::FP32_TO_INT32_IN_MEM:
37892 case X86::FP32_TO_INT64_IN_MEM:
37893 case X86::FP64_TO_INT16_IN_MEM:
37894 case X86::FP64_TO_INT32_IN_MEM:
37895 case X86::FP64_TO_INT64_IN_MEM:
37896 case X86::FP80_TO_INT16_IN_MEM:
37897 case X86::FP80_TO_INT32_IN_MEM:
37898 case X86::FP80_TO_INT64_IN_MEM: {
37899 // Change the floating point control register to use "round towards zero"
37900 // mode when truncating to an integer value.
37901 int OrigCWFrameIdx =
37902 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37903 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37904 OrigCWFrameIdx);
37905
37906 // Load the old value of the control word...
37907 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37908 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37909 OrigCWFrameIdx);
37910
37911 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37912 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37913 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37914 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37915
37916 // Extract to 16 bits.
37917 Register NewCW16 =
37918 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37919 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37920 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37921
37922 // Prepare memory for FLDCW.
37923 int NewCWFrameIdx =
37924 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37925 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37926 NewCWFrameIdx)
37927 .addReg(NewCW16, RegState::Kill);
37928
37929 // Reload the modified control word now...
37930 addFrameReference(BuildMI(*BB, MI, MIMD,
37931 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37932
37933 // Get the X86 opcode to use.
37934 unsigned Opc;
37935 switch (MI.getOpcode()) {
37936 // clang-format off
37937 default: llvm_unreachable("illegal opcode!");
37938 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37939 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37940 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37941 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37942 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37943 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37944 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37945 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37946 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37947 // clang-format on
37948 }
37949
37950 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37951 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37952 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37953
37954 // Reload the original control word now.
37955 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37956 OrigCWFrameIdx);
37957
37958 MI.eraseFromParent(); // The pseudo instruction is gone now.
37959 return BB;
37960 }
37961
37962 // xbegin
37963 case X86::XBEGIN:
37964 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37965
37966 case X86::VAARG_64:
37967 case X86::VAARG_X32:
37968 return EmitVAARGWithCustomInserter(MI, BB);
37969
37970 case X86::EH_SjLj_SetJmp32:
37971 case X86::EH_SjLj_SetJmp64:
37972 return emitEHSjLjSetJmp(MI, BB);
37973
37974 case X86::EH_SjLj_LongJmp32:
37975 case X86::EH_SjLj_LongJmp64:
37976 return emitEHSjLjLongJmp(MI, BB);
37977
37978 case X86::Int_eh_sjlj_setup_dispatch:
37979 return EmitSjLjDispatchBlock(MI, BB);
37980
37981 case TargetOpcode::STATEPOINT:
37982 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37983 // this point in the process. We diverge later.
37984 return emitPatchPoint(MI, BB);
37985
37986 case TargetOpcode::STACKMAP:
37987 case TargetOpcode::PATCHPOINT:
37988 return emitPatchPoint(MI, BB);
37989
37990 case TargetOpcode::PATCHABLE_EVENT_CALL:
37991 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37992 return emitPatchableEventCall(MI, BB);
37993
37994 case X86::LCMPXCHG8B: {
37995 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37996 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
37997 // requires a memory operand. If the current architecture is i686 and the
37998 // current function needs a base pointer
37999 // - which is ESI on i686 - the register allocator would not be able to
38000 // allocate registers for an address of the form X(%reg, %reg, Y):
38001 // there would never be enough unreserved registers during regalloc
38002 // (without the base pointer the only option would be X(%edi, %esi, Y)).
38003 // We give the register allocator a hand by precomputing the address in
38004 // a new vreg using LEA.
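// For example, a memory operand such as 12(%esi,%edi,2) is rewritten as
//   leal 12(%esi,%edi,2), %vreg
//   cmpxchg8b (%vreg)
// so CMPXCHG8B is left with a single plain base register.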
38005
38006 // If it is not i686 or there is no base pointer - nothing to do here.
38007 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38008 return BB;
38009
38010 // Even though this code does not necessarily need the base pointer to
38011 // be ESI, we check for that. The reason: if this assert fails, something
38012 // has changed in the compiler's base pointer handling, which most
38013 // probably has to be addressed somehow here.
38014 assert(TRI->getBaseRegister() == X86::ESI &&
38015 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38016 "base pointer in mind");
38017
38018 MachineRegisterInfo &MRI = MF->getRegInfo();
38019 MVT SPTy = getPointerTy(MF->getDataLayout());
38020 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38021 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38022
38023 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38024 // Regalloc does not need any help when the memory operand of CMPXCHG8B
38025 // does not use an index register.
38026 if (AM.IndexReg == X86::NoRegister)
38027 return BB;
38028
38029 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38030 // four operand definitions that are E[ABCD] registers. We skip them and
38031 // then insert the LEA.
38032 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38033 while (RMBBI != BB->rend() &&
38034 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38035 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38036 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38037 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38038 ++RMBBI;
38039 }
38040 MachineBasicBlock::iterator MBBI(RMBBI);
38041 addFullAddress(
38042 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38043
38044 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38045
38046 return BB;
38047 }
38048 case X86::LCMPXCHG16B_NO_RBX: {
38049 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38050 Register BasePtr = TRI->getBaseRegister();
38051 if (TRI->hasBasePointer(*MF) &&
38052 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38053 if (!BB->isLiveIn(BasePtr))
38054 BB->addLiveIn(BasePtr);
38055 // Save RBX into a virtual register.
38056 Register SaveRBX =
38057 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38058 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38059 .addReg(X86::RBX);
38060 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38061 MachineInstrBuilder MIB =
38062 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38063 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38064 MIB.add(MI.getOperand(Idx));
38065 MIB.add(MI.getOperand(X86::AddrNumOperands));
38066 MIB.addReg(SaveRBX);
38067 } else {
38068 // Simple case, just copy the virtual register to RBX.
38069 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38070 .add(MI.getOperand(X86::AddrNumOperands));
38071 MachineInstrBuilder MIB =
38072 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38073 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38074 MIB.add(MI.getOperand(Idx));
38075 }
38076 MI.eraseFromParent();
38077 return BB;
38078 }
38079 case X86::MWAITX: {
38080 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38081 Register BasePtr = TRI->getBaseRegister();
38082 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38083 // If there is no need to save the base pointer, we generate MWAITXrrr;
38084 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38085 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38086 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38087 .addReg(MI.getOperand(0).getReg());
38088 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38089 .addReg(MI.getOperand(1).getReg());
38090 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38091 .addReg(MI.getOperand(2).getReg());
38092 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38093 MI.eraseFromParent();
38094 } else {
38095 if (!BB->isLiveIn(BasePtr)) {
38096 BB->addLiveIn(BasePtr);
38097 }
38098 // Parameters can be copied into ECX and EAX but not EBX yet.
38099 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38100 .addReg(MI.getOperand(0).getReg());
38101 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38102 .addReg(MI.getOperand(1).getReg());
38103 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38104 // Save RBX into a virtual register.
38105 Register SaveRBX =
38106 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38107 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38108 .addReg(X86::RBX);
38109 // Generate mwaitx pseudo.
38110 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38111 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38112 .addDef(Dst) // Destination tied in with SaveRBX.
38113 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38114 .addUse(SaveRBX); // Save of base pointer.
38115 MI.eraseFromParent();
38116 }
38117 return BB;
38118 }
38119 case TargetOpcode::PREALLOCATED_SETUP: {
38120 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38121 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38122 MFI->setHasPreallocatedCall(true);
38123 int64_t PreallocatedId = MI.getOperand(0).getImm();
38124 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38125 assert(StackAdjustment != 0 && "0 stack adjustment");
38126 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38127 << StackAdjustment << "\n");
38128 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38129 .addReg(X86::ESP)
38130 .addImm(StackAdjustment);
38131 MI.eraseFromParent();
38132 return BB;
38133 }
38134 case TargetOpcode::PREALLOCATED_ARG: {
38135 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38136 int64_t PreallocatedId = MI.getOperand(1).getImm();
38137 int64_t ArgIdx = MI.getOperand(2).getImm();
38138 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38139 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38140 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38141 << ", arg offset " << ArgOffset << "\n");
38142 // stack pointer + offset
38143 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38144 MI.getOperand(0).getReg()),
38145 X86::ESP, false, ArgOffset);
38146 MI.eraseFromParent();
38147 return BB;
38148 }
38149 case X86::PTDPBSSD:
38150 case X86::PTDPBSUD:
38151 case X86::PTDPBUSD:
38152 case X86::PTDPBUUD:
38153 case X86::PTDPBF16PS:
38154 case X86::PTDPFP16PS:
38155 case X86::PTCMMIMFP16PS:
38156 case X86::PTCMMRLFP16PS:
38157 case X86::PTDPBF8PS:
38158 case X86::PTDPBHF8PS:
38159 case X86::PTDPHBF8PS:
38160 case X86::PTDPHF8PS:
38161 case X86::PTTDPBF16PS:
38162 case X86::PTTDPFP16PS:
38163 case X86::PTTCMMIMFP16PS:
38164 case X86::PTTCMMRLFP16PS:
38165 case X86::PTCONJTCMMIMFP16PS:
38166 case X86::PTMMULTF32PS:
38167 case X86::PTTMMULTF32PS: {
38168 unsigned Opc;
38169 switch (MI.getOpcode()) {
38170 default: llvm_unreachable("illegal opcode!");
38171 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38172 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38173 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38174 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38175 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38176 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38177 case X86::PTCMMIMFP16PS:
38178 Opc = X86::TCMMIMFP16PS;
38179 break;
38180 case X86::PTCMMRLFP16PS:
38181 Opc = X86::TCMMRLFP16PS;
38182 break;
38183 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38184 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38185 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38186 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38187 case X86::PTTDPBF16PS:
38188 Opc = X86::TTDPBF16PS;
38189 break;
38190 case X86::PTTDPFP16PS:
38191 Opc = X86::TTDPFP16PS;
38192 break;
38193 case X86::PTTCMMIMFP16PS:
38194 Opc = X86::TTCMMIMFP16PS;
38195 break;
38196 case X86::PTTCMMRLFP16PS:
38197 Opc = X86::TTCMMRLFP16PS;
38198 break;
38199 case X86::PTCONJTCMMIMFP16PS:
38200 Opc = X86::TCONJTCMMIMFP16PS;
38201 break;
38202 case X86::PTMMULTF32PS:
38203 Opc = X86::TMMULTF32PS;
38204 break;
38205 case X86::PTTMMULTF32PS:
38206 Opc = X86::TTMMULTF32PS;
38207 break;
38208 }
38209
38210 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38211 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38212 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38213 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38214 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38215
38216 MI.eraseFromParent(); // The pseudo is gone now.
38217 return BB;
38218 }
38219 case X86::PTILEZERO: {
38220 unsigned Imm = MI.getOperand(0).getImm();
38221 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38222 MI.eraseFromParent(); // The pseudo is gone now.
38223 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38224 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38225 return BB;
38226 }
38227 case X86::PTILEZEROV: {
38228 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38229 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38230 return BB;
38231 }
38232 case X86::PTILELOADDRS:
38233 case X86::PTILELOADDRST1:
38234 case X86::PTILELOADD:
38235 case X86::PTILELOADDT1:
38236 case X86::PTILESTORED: {
38237 unsigned Opc;
38238 switch (MI.getOpcode()) {
38239 default: llvm_unreachable("illegal opcode!");
38240#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38241 case X86::PTILELOADD:
38242 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38243 break;
38244 case X86::PTILELOADDT1:
38245 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38246 break;
38247 case X86::PTILESTORED:
38248 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38249 break;
38250 case X86::PTILELOADDRS:
38251 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38252 break;
38253 case X86::PTILELOADDRST1:
38254 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38255 break;
38256 }
38257#undef GET_EGPR_IF_ENABLED
38258
38259 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38260 unsigned CurOp = 0;
38261 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38262 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38263 RegState::Define);
38264
38265 MIB.add(MI.getOperand(CurOp++)); // base
38266 MIB.add(MI.getOperand(CurOp++)); // scale
38267 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38268 MIB.add(MI.getOperand(CurOp++)); // displacement
38269 MIB.add(MI.getOperand(CurOp++)); // segment
38270
38271 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38272 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38273 RegState::Undef);
38274
38275 MI.eraseFromParent(); // The pseudo is gone now.
38276 return BB;
38277 }
38278 case X86::PT2RPNTLVWZ0:
38279 case X86::PT2RPNTLVWZ0T1:
38280 case X86::PT2RPNTLVWZ1:
38281 case X86::PT2RPNTLVWZ1T1:
38282 case X86::PT2RPNTLVWZ0RS:
38283 case X86::PT2RPNTLVWZ0RST1:
38284 case X86::PT2RPNTLVWZ1RS:
38285 case X86::PT2RPNTLVWZ1RST1: {
38286 const DebugLoc &DL = MI.getDebugLoc();
38287 unsigned Opc;
38288#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38289 switch (MI.getOpcode()) {
38290 default:
38291 llvm_unreachable("Unexpected instruction!");
38292 case X86::PT2RPNTLVWZ0:
38293 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38294 break;
38295 case X86::PT2RPNTLVWZ0T1:
38296 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38297 break;
38298 case X86::PT2RPNTLVWZ1:
38299 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38300 break;
38301 case X86::PT2RPNTLVWZ1T1:
38302 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38303 break;
38304 case X86::PT2RPNTLVWZ0RS:
38305 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38306 break;
38307 case X86::PT2RPNTLVWZ0RST1:
38308 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38309 break;
38310 case X86::PT2RPNTLVWZ1RS:
38311 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38312 break;
38313 case X86::PT2RPNTLVWZ1RST1:
38314 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38315 break;
38316 }
38317#undef GET_EGPR_IF_ENABLED
38318 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38319 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38320
38321 MIB.add(MI.getOperand(1)); // base
38322 MIB.add(MI.getOperand(2)); // scale
38323 MIB.add(MI.getOperand(3)); // index
38324 MIB.add(MI.getOperand(4)); // displacement
38325 MIB.add(MI.getOperand(5)); // segment
38326 MI.eraseFromParent(); // The pseudo is gone now.
38327 return BB;
38328 }
38329 case X86::PTTRANSPOSED:
38330 case X86::PTCONJTFP16: {
38331 const DebugLoc &DL = MI.getDebugLoc();
38332 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38333 : X86::TCONJTFP16;
38334
38335 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38336 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38337 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38338
38339 MI.eraseFromParent(); // The pseudo is gone now.
38340 return BB;
38341 }
38342 case X86::PTCVTROWPS2BF16Hrri:
38343 case X86::PTCVTROWPS2BF16Lrri:
38344 case X86::PTCVTROWPS2PHHrri:
38345 case X86::PTCVTROWPS2PHLrri:
38346 case X86::PTCVTROWD2PSrri:
38347 case X86::PTILEMOVROWrri: {
38348 const DebugLoc &DL = MI.getDebugLoc();
38349 unsigned Opc;
38350 switch (MI.getOpcode()) {
38351 default:
38352 llvm_unreachable("Unexpected instruction!");
38353 case X86::PTCVTROWD2PSrri:
38354 Opc = X86::TCVTROWD2PSrri;
38355 break;
38356 case X86::PTCVTROWPS2BF16Hrri:
38357 Opc = X86::TCVTROWPS2BF16Hrri;
38358 break;
38359 case X86::PTCVTROWPS2PHHrri:
38360 Opc = X86::TCVTROWPS2PHHrri;
38361 break;
38362 case X86::PTCVTROWPS2BF16Lrri:
38363 Opc = X86::TCVTROWPS2BF16Lrri;
38364 break;
38365 case X86::PTCVTROWPS2PHLrri:
38366 Opc = X86::TCVTROWPS2PHLrri;
38367 break;
38368 case X86::PTILEMOVROWrri:
38369 Opc = X86::TILEMOVROWrri;
38370 break;
38371 }
38372 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38373 MIB.add(MI.getOperand(0));
38374 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38375 MIB.addImm(MI.getOperand(2).getImm());
38376
38377 MI.eraseFromParent(); // The pseudo is gone now.
38378 return BB;
38379 }
38380 case X86::PTCVTROWPS2BF16Hrre:
38381 case X86::PTCVTROWPS2BF16Lrre:
38382 case X86::PTCVTROWPS2PHHrre:
38383 case X86::PTCVTROWPS2PHLrre:
38384 case X86::PTCVTROWD2PSrre:
38385 case X86::PTILEMOVROWrre: {
38386 const DebugLoc &DL = MI.getDebugLoc();
38387 unsigned Opc;
38388 switch (MI.getOpcode()) {
38389 default:
38390 llvm_unreachable("Unexpected instruction!");
38391 case X86::PTCVTROWD2PSrre:
38392 Opc = X86::TCVTROWD2PSrre;
38393 break;
38394 case X86::PTCVTROWPS2BF16Hrre:
38395 Opc = X86::TCVTROWPS2BF16Hrre;
38396 break;
38397 case X86::PTCVTROWPS2BF16Lrre:
38398 Opc = X86::TCVTROWPS2BF16Lrre;
38399 break;
38400 case X86::PTCVTROWPS2PHHrre:
38401 Opc = X86::TCVTROWPS2PHHrre;
38402 break;
38403 case X86::PTCVTROWPS2PHLrre:
38404 Opc = X86::TCVTROWPS2PHLrre;
38405 break;
38406 case X86::PTILEMOVROWrre:
38407 Opc = X86::TILEMOVROWrre;
38408 break;
38409 }
38410 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38411 MIB.add(MI.getOperand(0));
38412 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38413 MIB.add(MI.getOperand(2));
38414
38415 MI.eraseFromParent(); // The pseudo is gone now.
38416 return BB;
38417 }
38418 }
38419}
38420
38421//===----------------------------------------------------------------------===//
38422// X86 Optimization Hooks
38423//===----------------------------------------------------------------------===//
38424
38425 bool
38426 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38427 const APInt &DemandedBits,
38428 const APInt &DemandedElts,
38429 TargetLoweringOpt &TLO) const {
38430 EVT VT = Op.getValueType();
38431 unsigned Opcode = Op.getOpcode();
38432 unsigned EltSize = VT.getScalarSizeInBits();
38433
38434 if (VT.isVector()) {
38435 // If the constant is only all signbits in the active bits, then we should
38436 // extend it to the entire constant to allow it to act as a boolean constant
38437 // vector.
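// For example, with only bit 0 demanded, a splat-of-1 operand is all sign
// bits within the single active bit; sign-extending it in place yields a
// splat of -1, which later combines can treat as an all-ones boolean vector.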
38438 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38439 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38440 return false;
38441 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38442 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38443 continue;
38444 const APInt &Val = V.getConstantOperandAPInt(i);
38445 if (Val.getBitWidth() > Val.getNumSignBits() &&
38446 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38447 return true;
38448 }
38449 return false;
38450 };
38451 // For vectors - if we have a constant, then try to sign extend.
38452 // TODO: Handle AND cases.
38453 unsigned ActiveBits = DemandedBits.getActiveBits();
38454 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38455 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38456 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38457 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38458 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38459 VT.getVectorNumElements());
38460 SDValue NewC =
38461 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38462 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38463 SDValue NewOp =
38464 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38465 return TLO.CombineTo(Op, NewOp);
38466 }
38467 return false;
38468 }
38469
38470 // Only optimize Ands to prevent shrinking a constant that could be
38471 // matched by movzx.
38472 if (Opcode != ISD::AND)
38473 return false;
38474
38475 // Make sure the RHS really is a constant.
38476 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38477 if (!C)
38478 return false;
38479
38480 const APInt &Mask = C->getAPIntValue();
38481
38482 // Clear all non-demanded bits initially.
38483 APInt ShrunkMask = Mask & DemandedBits;
38484
38485 // Find the width of the shrunk mask.
38486 unsigned Width = ShrunkMask.getActiveBits();
38487
38488 // If the mask is all 0s there's nothing to do here.
38489 if (Width == 0)
38490 return false;
38491
38492 // Find the next power of 2 width, rounding up to a byte.
38493 Width = llvm::bit_ceil(std::max(Width, 8U));
38494 // Truncate the width to size to handle illegal types.
38495 Width = std::min(Width, EltSize);
38496
38497 // Calculate a possible zero extend mask for this constant.
38498 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
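// For example, with EltSize = 32, Mask = 0x1ff and DemandedBits = 0xff:
// ShrunkMask = 0xff, Width = 8 and ZeroExtendMask = 0xff, which is a subset
// of Mask | ~DemandedBits, so the AND constant is rewritten to 0xff and the
// operation becomes a movzx candidate.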
38499
38500 // If we aren't changing the mask, just return true to keep it and prevent
38501 // the caller from optimizing.
38502 if (ZeroExtendMask == Mask)
38503 return true;
38504
38505 // Make sure the new mask can be represented by a combination of mask bits
38506 // and non-demanded bits.
38507 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38508 return false;
38509
38510 // Replace the constant with the zero extend mask.
38511 SDLoc DL(Op);
38512 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38513 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38514 return TLO.CombineTo(Op, NewOp);
38515}
38516
38517 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38518 KnownBits &Known,
38519 const APInt &DemandedElts,
38520 const SelectionDAG &DAG, unsigned Depth) {
38521 KnownBits Known2;
38522 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38523 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38524 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38525 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38526 Known = KnownBits::abdu(Known, Known2).zext(16);
38527 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
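// abdu() above models one |Di| term; each of the three additions below
// doubles the running bound, so together they conservatively cover the sum
// of the eight byte differences feeding each 64-bit lane.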
38528 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38529 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38530 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38531 Known = Known.zext(64);
38532}
38533
38534 static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38535 KnownBits &Known,
38536 const APInt &DemandedElts,
38537 const SelectionDAG &DAG,
38538 unsigned Depth) {
38539 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38540
38541 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38542 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38543 APInt DemandedLoElts =
38544 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38545 APInt DemandedHiElts =
38546 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
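// Each i32 result element is LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1]; the
// 0b01/0b10 splat masks pick the even (Lo) and odd (Hi) members of each
// source pair so their known bits can be computed independently.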
38547 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38548 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38549 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38550 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38551 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38552 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38553 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38554}
38555
38556 static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38557 KnownBits &Known,
38558 const APInt &DemandedElts,
38559 const SelectionDAG &DAG,
38560 unsigned Depth) {
38561 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38562
38563 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38564 // pairs.
38565 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38566 APInt DemandedLoElts =
38567 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38568 APInt DemandedHiElts =
38569 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38570 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38571 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38572 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38573 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38574 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38575 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38576 Known = KnownBits::sadd_sat(Lo, Hi);
38577}
38578
38579 static KnownBits computeKnownBitsForHorizontalOperation(
38580 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38581 const SelectionDAG &DAG,
38582 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38583 KnownBitsFunc) {
38584 APInt DemandedEltsLHS, DemandedEltsRHS;
38585 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38586 DemandedElts, DemandedEltsLHS,
38587 DemandedEltsRHS);
38588
38589 const auto ComputeForSingleOpFunc =
38590 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38591 return KnownBitsFunc(
38592 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38593 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38594 };
38595
38596 if (DemandedEltsRHS.isZero())
38597 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38598 if (DemandedEltsLHS.isZero())
38599 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38600
38601 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38602 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38603}
38604
38605 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38606 KnownBits &Known,
38607 const APInt &DemandedElts,
38608 const SelectionDAG &DAG,
38609 unsigned Depth) const {
38610 unsigned BitWidth = Known.getBitWidth();
38611 unsigned NumElts = DemandedElts.getBitWidth();
38612 unsigned Opc = Op.getOpcode();
38613 EVT VT = Op.getValueType();
38614 assert((Opc >= ISD::BUILTIN_OP_END ||
38615 Opc == ISD::INTRINSIC_WO_CHAIN ||
38616 Opc == ISD::INTRINSIC_W_CHAIN ||
38617 Opc == ISD::INTRINSIC_VOID) &&
38618 "Should use MaskedValueIsZero if you don't know whether Op"
38619 " is a target node!");
38620
38621 Known.resetAll();
38622 switch (Opc) {
38623 default: break;
38624 case X86ISD::MUL_IMM: {
38625 KnownBits Known2;
38626 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38627 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38628 Known = KnownBits::mul(Known, Known2);
38629 break;
38630 }
38631 case X86ISD::BSF: {
38632 Known.Zero.setBitsFrom(Log2_32(BitWidth));
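// Operand 0 is the pass-through value used when the source (operand 1) is
// zero, so, when it is not undef, its known bits are intersected into the
// result below.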
38633
38634 KnownBits Known2;
38635 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38636 if (Known2.isNonZero()) {
38637 // If we have a known 1, its position is our upper bound.
38638 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38639 unsigned LowBits = llvm::bit_width(PossibleTZ);
38640 Known.Zero.setBitsFrom(LowBits);
38641 } else if (!Op.getOperand(0).isUndef()) {
38642 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38643 Known = Known.intersectWith(Known2);
38644 }
38645 break;
38646 }
38647 case X86ISD::BSR: {
38648 // TODO: Bound with input known bits?
38649 Known.Zero.setBitsFrom(Log2_32(BitWidth));
38650
38651 if (!Op.getOperand(0).isUndef() &&
38652 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38653 KnownBits Known2;
38654 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38655 Known = Known.intersectWith(Known2);
38656 }
38657 break;
38658 }
38659 case X86ISD::SETCC:
38660 Known.Zero.setBitsFrom(1);
38661 break;
38662 case X86ISD::MOVMSK: {
38663 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38664 Known.Zero.setBitsFrom(NumLoBits);
38665 break;
38666 }
38667 case X86ISD::PEXTRB:
38668 case X86ISD::PEXTRW: {
38669 SDValue Src = Op.getOperand(0);
38670 EVT SrcVT = Src.getValueType();
38671 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38672 Op.getConstantOperandVal(1));
38673 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38674 Known = Known.anyextOrTrunc(BitWidth);
38675 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38676 break;
38677 }
38678 case X86ISD::VSRAI:
38679 case X86ISD::VSHLI:
38680 case X86ISD::VSRLI: {
38681 unsigned ShAmt = Op.getConstantOperandVal(1);
38682 if (ShAmt >= VT.getScalarSizeInBits()) {
38683 // Out of range logical bit shifts are guaranteed to be zero.
38684 // Out of range arithmetic bit shifts splat the sign bit.
38685 if (Opc != X86ISD::VSRAI) {
38686 Known.setAllZero();
38687 break;
38688 }
38689
38690 ShAmt = VT.getScalarSizeInBits() - 1;
38691 }
38692
38693 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38694 if (Opc == X86ISD::VSHLI) {
38695 Known <<= ShAmt;
38696 // Low bits are known zero.
38697 Known.Zero.setLowBits(ShAmt);
38698 } else if (Opc == X86ISD::VSRLI) {
38699 Known >>= ShAmt;
38700 // High bits are known zero.
38701 Known.Zero.setHighBits(ShAmt);
38702 } else {
38703 Known.Zero.ashrInPlace(ShAmt);
38704 Known.One.ashrInPlace(ShAmt);
38705 }
38706 break;
38707 }
38708 case X86ISD::PACKUS: {
38709 // PACKUS is just a truncation if the upper half is zero.
38710 APInt DemandedLHS, DemandedRHS;
38711 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38712
38713 Known.One = APInt::getAllOnes(BitWidth * 2);
38714 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38715
38716 KnownBits Known2;
38717 if (!!DemandedLHS) {
38718 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38719 Known = Known.intersectWith(Known2);
38720 }
38721 if (!!DemandedRHS) {
38722 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38723 Known = Known.intersectWith(Known2);
38724 }
38725
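// The pack is only a plain truncation when the top half of every demanded
// source element is known zero, i.e. there are at least BitWidth known
// leading zeros; otherwise saturation may apply and nothing useful is known.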
38726 if (Known.countMinLeadingZeros() < BitWidth)
38727 Known.resetAll();
38728 Known = Known.trunc(BitWidth);
38729 break;
38730 }
38731 case X86ISD::PSHUFB: {
38732 SDValue Src = Op.getOperand(0);
38733 SDValue Idx = Op.getOperand(1);
38734
38735 // If the index vector is never negative (MSB is zero), then all elements
38736 // come from the source vector. This is useful for cases where
38737 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38738 // below will handle the more common constant shuffle mask case.
38739 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38740 if (KnownIdx.isNonNegative())
38741 Known = DAG.computeKnownBits(Src, Depth + 1);
38742 break;
38743 }
38744 case X86ISD::VBROADCAST: {
38745 SDValue Src = Op.getOperand(0);
38746 if (!Src.getSimpleValueType().isVector()) {
38747 Known = DAG.computeKnownBits(Src, Depth + 1);
38748 return;
38749 }
38750 break;
38751 }
38752 case X86ISD::AND: {
38753 if (Op.getResNo() == 0) {
38754 KnownBits Known2;
38755 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38756 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38757 Known &= Known2;
38758 }
38759 break;
38760 }
38761 case X86ISD::ANDNP: {
38762 KnownBits Known2;
38763 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38764 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38765
38766 // ANDNP = (~X & Y);
38767 Known.One &= Known2.Zero;
38768 Known.Zero |= Known2.One;
38769 break;
38770 }
38771 case X86ISD::FOR: {
38772 KnownBits Known2;
38773 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38774 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38775
38776 Known |= Known2;
38777 break;
38778 }
38779 case X86ISD::PSADBW: {
38780 SDValue LHS = Op.getOperand(0);
38781 SDValue RHS = Op.getOperand(1);
38782 assert(VT.getScalarType() == MVT::i64 &&
38783 LHS.getValueType() == RHS.getValueType() &&
38784 LHS.getValueType().getScalarType() == MVT::i8 &&
38785 "Unexpected PSADBW types");
38786 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38787 break;
38788 }
38789 case X86ISD::PCMPGT:
38790 case X86ISD::PCMPEQ: {
38791 KnownBits KnownLhs =
38792 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38793 KnownBits KnownRhs =
38794 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38795 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38796 ? KnownBits::eq(KnownLhs, KnownRhs)
38797 : KnownBits::sgt(KnownLhs, KnownRhs);
38798 if (Res) {
38799 if (*Res)
38800 Known.setAllOnes();
38801 else
38802 Known.setAllZero();
38803 }
38804 break;
38805 }
38806 case X86ISD::VPMADDWD: {
38807 SDValue LHS = Op.getOperand(0);
38808 SDValue RHS = Op.getOperand(1);
38809 assert(VT.getVectorElementType() == MVT::i32 &&
38810 LHS.getValueType() == RHS.getValueType() &&
38811 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38812 "Unexpected PMADDWD types");
38813 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38814 break;
38815 }
38816 case X86ISD::VPMADDUBSW: {
38817 SDValue LHS = Op.getOperand(0);
38818 SDValue RHS = Op.getOperand(1);
38819 assert(VT.getVectorElementType() == MVT::i16 &&
38820 LHS.getValueType() == RHS.getValueType() &&
38821 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38822 "Unexpected PMADDUBSW types");
38823 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38824 break;
38825 }
38826 case X86ISD::PMULUDQ: {
38827 KnownBits Known2;
38828 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38829 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38830
38831 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38832 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38833 Known = KnownBits::mul(Known, Known2);
38834 break;
38835 }
38836 case X86ISD::CMOV: {
38837 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38838 // If we don't know any bits, early out.
38839 if (Known.isUnknown())
38840 break;
38841 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38842
38843 // Only known if known in both the LHS and RHS.
38844 Known = Known.intersectWith(Known2);
38845 break;
38846 }
38847 case X86ISD::BEXTR:
38848 case X86ISD::BEXTRI: {
38849 SDValue Op0 = Op.getOperand(0);
38850 SDValue Op1 = Op.getOperand(1);
38851
38852 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38853 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38854 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38855
38856 // If the length is 0, the result is 0.
38857 if (Length == 0) {
38858 Known.setAllZero();
38859 break;
38860 }
38861
38862 if ((Shift + Length) <= BitWidth) {
38863 Known = DAG.computeKnownBits(Op0, Depth + 1);
38864 Known = Known.extractBits(Length, Shift);
38865 Known = Known.zextOrTrunc(BitWidth);
38866 }
38867 }
38868 break;
38869 }
38870 case X86ISD::PDEP: {
38871 KnownBits Known2;
38872 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38873 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38874 // Zeros are retained from the mask operand. But not ones.
38875 Known.One.clearAllBits();
38876 // The result will have at least as many trailing zeros as the non-mask
38877 // operand since bits can only map to the same or higher bit position.
38878 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38879 break;
38880 }
38881 case X86ISD::PEXT: {
38882 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38883 // The result has as many leading zeros as the number of zeroes in the mask.
38884 unsigned Count = Known.Zero.popcount();
38885 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38886 Known.One.clearAllBits();
38887 break;
38888 }
38889 case X86ISD::VTRUNC:
38890 case X86ISD::VTRUNCS:
38891 case X86ISD::VTRUNCUS:
38892 case X86ISD::CVTSI2P:
38893 case X86ISD::CVTUI2P:
38894 case X86ISD::CVTP2SI:
38895 case X86ISD::CVTP2UI:
38896 case X86ISD::MCVTP2SI:
38897 case X86ISD::MCVTP2UI:
38898 case X86ISD::CVTTP2SI:
38899 case X86ISD::CVTTP2UI:
38900 case X86ISD::MCVTTP2SI:
38901 case X86ISD::MCVTTP2UI:
38902 case X86ISD::MCVTSI2P:
38903 case X86ISD::MCVTUI2P:
38904 case X86ISD::VFPROUND:
38905 case X86ISD::VMFPROUND:
38906 case X86ISD::CVTPS2PH:
38907 case X86ISD::MCVTPS2PH:
38908 case X86ISD::MCVTTP2SIS:
38909 case X86ISD::MCVTTP2UIS: {
38910 // Truncations/Conversions - upper elements are known zero.
38911 EVT SrcVT = Op.getOperand(0).getValueType();
38912 if (SrcVT.isVector()) {
38913 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38914 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38915 Known.setAllZero();
38916 }
38917 break;
38918 }
38919 case X86ISD::STRICT_CVTTP2SI:
38920 case X86ISD::STRICT_CVTTP2UI:
38921 case X86ISD::STRICT_CVTSI2P:
38922 case X86ISD::STRICT_CVTUI2P:
38923 case X86ISD::STRICT_VFPROUND:
38924 case X86ISD::STRICT_CVTPS2PH: {
38925 // Strict Conversions - upper elements are known zero.
38926 EVT SrcVT = Op.getOperand(1).getValueType();
38927 if (SrcVT.isVector()) {
38928 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38929 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38930 Known.setAllZero();
38931 }
38932 break;
38933 }
38934 case X86ISD::MOVQ2DQ: {
38935 // Move from MMX to XMM. Upper half of XMM should be 0.
38936 if (DemandedElts.countr_zero() >= (NumElts / 2))
38937 Known.setAllZero();
38938 break;
38939 }
38940 case X86ISD::VBROADCAST_LOAD: {
38941 APInt UndefElts;
38942 SmallVector<APInt, 16> EltBits;
38943 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38944 /*AllowWholeUndefs*/ false,
38945 /*AllowPartialUndefs*/ false)) {
38946 Known.Zero.setAllBits();
38947 Known.One.setAllBits();
38948 for (unsigned I = 0; I != NumElts; ++I) {
38949 if (!DemandedElts[I])
38950 continue;
38951 if (UndefElts[I]) {
38952 Known.resetAll();
38953 break;
38954 }
38955 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38956 Known = Known.intersectWith(Known2);
38957 }
38958 return;
38959 }
38960 break;
38961 }
38962 case X86ISD::HADD:
38963 case X86ISD::HSUB: {
38964 Known = computeKnownBitsForHorizontalOperation(
38965 Op, DemandedElts, Depth, DAG,
38966 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38967 return KnownBits::computeForAddSub(
38968 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38969 KnownLHS, KnownRHS);
38970 });
38971 break;
38972 }
38973 case ISD::INTRINSIC_WO_CHAIN: {
38974 switch (Op->getConstantOperandVal(0)) {
38975 case Intrinsic::x86_sse2_pmadd_wd:
38976 case Intrinsic::x86_avx2_pmadd_wd:
38977 case Intrinsic::x86_avx512_pmaddw_d_512: {
38978 SDValue LHS = Op.getOperand(1);
38979 SDValue RHS = Op.getOperand(2);
38980 assert(VT.getScalarType() == MVT::i32 &&
38981 LHS.getValueType() == RHS.getValueType() &&
38982 LHS.getValueType().getScalarType() == MVT::i16 &&
38983 "Unexpected PMADDWD types");
38984 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38985 break;
38986 }
38987 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38988 case Intrinsic::x86_avx2_pmadd_ub_sw:
38989 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38990 SDValue LHS = Op.getOperand(1);
38991 SDValue RHS = Op.getOperand(2);
38992 assert(VT.getScalarType() == MVT::i16 &&
38993 LHS.getValueType() == RHS.getValueType() &&
38994 LHS.getValueType().getScalarType() == MVT::i8 &&
38995 "Unexpected PMADDUBSW types");
38996 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38997 break;
38998 }
38999 case Intrinsic::x86_sse2_psad_bw:
39000 case Intrinsic::x86_avx2_psad_bw:
39001 case Intrinsic::x86_avx512_psad_bw_512: {
39002 SDValue LHS = Op.getOperand(1);
39003 SDValue RHS = Op.getOperand(2);
39004 assert(VT.getScalarType() == MVT::i64 &&
39005 LHS.getValueType() == RHS.getValueType() &&
39006 LHS.getValueType().getScalarType() == MVT::i8 &&
39007 "Unexpected PSADBW types");
39008 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
39009 break;
39010 }
39011 }
39012 break;
39013 }
39014 case X86ISD::VPMADD52L:
39015 case X86ISD::VPMADD52H: {
39016 assert(Op.getValueType().isVector() &&
39017 Op.getValueType().getScalarType() == MVT::i64 &&
39018 "Unexpected VPMADD52 type");
39019 KnownBits K0 =
39020 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
39021 KnownBits K1 =
39022 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
39023 KnownBits KAcc =
39024 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
39025 K0 = K0.trunc(52);
39026 K1 = K1.trunc(52);
39027 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
39028 ? KnownBits::mul(K0, K1)
39029 : KnownBits::mulhu(K0, K1);
39030 KnownMul = KnownMul.zext(64);
39031 Known = KnownBits::add(KAcc, KnownMul);
39032 return;
39033 }
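// Illustrative example of the VPMADD52 handling above: if both multiplicands
// are known to be less than 2^26 in every demanded lane, their 52-bit product
// fits in 52 bits, so for VPMADD52H the mulhu term is known zero and the
// result's known bits collapse to those of the accumulator.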
39034 }
39035
39036 // Handle target shuffles.
39037 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39038 if (isTargetShuffle(Opc)) {
39039 SmallVector<int, 64> Mask;
39040 SmallVector<SDValue, 2> Ops;
39041 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39042 unsigned NumOps = Ops.size();
39043 unsigned NumElts = VT.getVectorNumElements();
39044 if (Mask.size() == NumElts) {
39045 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39046 Known.Zero.setAllBits(); Known.One.setAllBits();
39047 for (unsigned i = 0; i != NumElts; ++i) {
39048 if (!DemandedElts[i])
39049 continue;
39050 int M = Mask[i];
39051 if (M == SM_SentinelUndef) {
39052 // For UNDEF elements, we don't know anything about the common state
39053 // of the shuffle result.
39054 Known.resetAll();
39055 break;
39056 }
39057 if (M == SM_SentinelZero) {
39058 Known.One.clearAllBits();
39059 continue;
39060 }
39061 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39062 "Shuffle index out of range");
39063
39064 unsigned OpIdx = (unsigned)M / NumElts;
39065 unsigned EltIdx = (unsigned)M % NumElts;
39066 if (Ops[OpIdx].getValueType() != VT) {
39067 // TODO - handle target shuffle ops with different value types.
39068 Known.resetAll();
39069 break;
39070 }
39071 DemandedOps[OpIdx].setBit(EltIdx);
39072 }
39073 // Known bits are the values that are shared by every demanded element.
39074 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39075 if (!DemandedOps[i])
39076 continue;
39077 KnownBits Known2 =
39078 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39079 Known = Known.intersectWith(Known2);
39080 }
39081 }
39082 }
39083 }
39084}
39085
39086 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39087 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39088 unsigned Depth) const {
39089 EVT VT = Op.getValueType();
39090 unsigned VTBits = VT.getScalarSizeInBits();
39091 unsigned Opcode = Op.getOpcode();
39092 switch (Opcode) {
39093 case X86ISD::SETCC_CARRY:
39094 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39095 return VTBits;
39096
39097 case X86ISD::VTRUNC: {
39098 SDValue Src = Op.getOperand(0);
39099 MVT SrcVT = Src.getSimpleValueType();
39100 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39101 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39102 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39103 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39104 if (Tmp > (NumSrcBits - VTBits))
39105 return Tmp - (NumSrcBits - VTBits);
39106 return 1;
39107 }
39108
39109 case X86ISD::PACKSS: {
39110 // PACKSS is just a truncation if the sign bits extend to the packed size.
39111 APInt DemandedLHS, DemandedRHS;
39112 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39113 DemandedRHS);
39114
39115 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39116 // patterns often used to compact vXi64 allsignbit patterns.
39117 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39118 SDValue BC = peekThroughBitcasts(V);
39119 if (BC.getOpcode() == X86ISD::PACKSS &&
39120 BC.getScalarValueSizeInBits() == 16 &&
39121 V.getScalarValueSizeInBits() == 32) {
39122 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39123 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39124 if (BC0.getScalarValueSizeInBits() == 64 &&
39125 BC1.getScalarValueSizeInBits() == 64 &&
39126 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39127 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39128 return 32;
39129 }
39130 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39131 };
39132
39133 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39134 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39135 if (!!DemandedLHS)
39136 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39137 if (!!DemandedRHS)
39138 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39139 unsigned Tmp = std::min(Tmp0, Tmp1);
39140 if (Tmp > (SrcBits - VTBits))
39141 return Tmp - (SrcBits - VTBits);
39142 return 1;
39143 }
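// Illustrative example of the PACKSS rule above: packing v4i32 sources that
// each have at least 20 sign bits down to i16 elements leaves at least
// 20 - (32 - 16) = 4 sign bits in every result element.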
39144
39145 case X86ISD::VBROADCAST: {
39146 SDValue Src = Op.getOperand(0);
39147 if (!Src.getSimpleValueType().isVector())
39148 return DAG.ComputeNumSignBits(Src, Depth + 1);
39149 break;
39150 }
39151
39152 case X86ISD::VSHLI: {
39153 SDValue Src = Op.getOperand(0);
39154 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39155 if (ShiftVal.uge(VTBits))
39156 return VTBits; // Shifted all bits out --> zero.
39157 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39158 if (ShiftVal.uge(Tmp))
39159 return 1; // Shifted all sign bits out --> unknown.
39160 return Tmp - ShiftVal.getZExtValue();
39161 }
39162
39163 case X86ISD::VSRAI: {
39164 SDValue Src = Op.getOperand(0);
39165 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39166 if (ShiftVal.uge(VTBits - 1))
39167 return VTBits; // Sign splat.
39168 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39169 ShiftVal += Tmp;
39170 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39171 }
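// Illustrative example of the VSRAI rule above: an i32 element with 5 known
// sign bits shifted right arithmetically by 8 has min(5 + 8, 32) = 13 known
// sign bits.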
39172
39173 case X86ISD::FSETCC:
39174 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39175 if (VT == MVT::f32 || VT == MVT::f64 ||
39176 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39177 return VTBits;
39178 break;
39179
39180 case X86ISD::PCMPGT:
39181 case X86ISD::PCMPEQ:
39182 case X86ISD::CMPP:
39183 case X86ISD::VPCOM:
39184 case X86ISD::VPCOMU:
39185 // Vector compares return zero/all-bits result values.
39186 return VTBits;
39187
39188 case X86ISD::ANDNP: {
39189 unsigned Tmp0 =
39190 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39191 if (Tmp0 == 1) return 1; // Early out.
39192 unsigned Tmp1 =
39193 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39194 return std::min(Tmp0, Tmp1);
39195 }
39196
39197 case X86ISD::CMOV: {
39198 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39199 if (Tmp0 == 1) return 1; // Early out.
39200 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39201 return std::min(Tmp0, Tmp1);
39202 }
39203 }
39204
39205 // Handle target shuffles.
39206 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39207 if (isTargetShuffle(Opcode)) {
39208 SmallVector<int, 64> Mask;
39209 SmallVector<SDValue, 2> Ops;
39210 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39211 unsigned NumOps = Ops.size();
39212 unsigned NumElts = VT.getVectorNumElements();
39213 if (Mask.size() == NumElts) {
39214 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39215 for (unsigned i = 0; i != NumElts; ++i) {
39216 if (!DemandedElts[i])
39217 continue;
39218 int M = Mask[i];
39219 if (M == SM_SentinelUndef) {
39220 // For UNDEF elements, we don't know anything about the common state
39221 // of the shuffle result.
39222 return 1;
39223 } else if (M == SM_SentinelZero) {
39224 // Zero = all sign bits.
39225 continue;
39226 }
39227 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39228 "Shuffle index out of range");
39229
39230 unsigned OpIdx = (unsigned)M / NumElts;
39231 unsigned EltIdx = (unsigned)M % NumElts;
39232 if (Ops[OpIdx].getValueType() != VT) {
39233 // TODO - handle target shuffle ops with different value types.
39234 return 1;
39235 }
39236 DemandedOps[OpIdx].setBit(EltIdx);
39237 }
39238 unsigned Tmp0 = VTBits;
39239 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39240 if (!DemandedOps[i])
39241 continue;
39242 unsigned Tmp1 =
39243 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39244 Tmp0 = std::min(Tmp0, Tmp1);
39245 }
39246 return Tmp0;
39247 }
39248 }
39249 }
39250
39251 // Fallback case.
39252 return 1;
39253}
39254
39255 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39256 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39257 return N->getOperand(0);
39258 return N;
39259}
39260
39261// Helper to look for a normal load that can be narrowed into a vzload with the
39262 // specified VT and memory VT. Returns SDValue() on failure.
39263 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39264 SelectionDAG &DAG) {
39265 // Can't if the load is volatile or atomic.
39266 if (!LN->isSimple())
39267 return SDValue();
39268
39269 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39270 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39271 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39272 LN->getPointerInfo(), LN->getBaseAlign(),
39273 LN->getMemOperand()->getFlags());
39274}
39275
39276// Attempt to match a combined shuffle mask against supported unary shuffle
39277// instructions.
39278// TODO: Investigate sharing more of this with shuffle lowering.
39279static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39280 bool AllowFloatDomain, bool AllowIntDomain,
39281 SDValue V1, const SelectionDAG &DAG,
39282 const X86Subtarget &Subtarget, unsigned &Shuffle,
39283 MVT &SrcVT, MVT &DstVT) {
39284 unsigned NumMaskElts = Mask.size();
39285 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39286
39287 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39288 if (Mask[0] == 0 &&
39289 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39290 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39291 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39292 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39293 Shuffle = X86ISD::VZEXT_MOVL;
39294 if (MaskEltSize == 16)
39295 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39296 else
39297 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39298 return true;
39299 }
39300 }
39301
39302 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39303 if (AllowIntDomain &&
39304 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39305 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39306 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39307 unsigned MaxScale = 64 / MaskEltSize;
39308 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39309 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39310 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39311 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39312 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39313 continue;
39314 bool MatchAny = true;
39315 bool MatchZero = true;
39316 bool MatchSign = UseSign;
39317 unsigned NumDstElts = NumMaskElts / Scale;
39318 for (unsigned i = 0;
39319 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39320 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39321 MatchAny = MatchSign = MatchZero = false;
39322 break;
39323 }
39324 unsigned Pos = (i * Scale) + 1;
39325 unsigned Len = Scale - 1;
39326 MatchAny &= isUndefInRange(Mask, Pos, Len);
39327 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39328 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39329 }
39330 if (MatchAny || MatchSign || MatchZero) {
39331 assert((MatchSign || MatchZero) &&
39332 "Failed to match sext/zext but matched aext?");
39333 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39334 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39335 : MVT::getIntegerVT(MaskEltSize);
39336 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39337
39338 Shuffle = unsigned(
39339 MatchAny ? ISD::ANY_EXTEND
39340 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39341 if (SrcVT.getVectorNumElements() != NumDstElts)
39342 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39343
39344 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39345 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39346 return true;
39347 }
39348 }
39349 }
39350
39351 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39352 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39353 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39354 isUndefOrEqual(Mask[0], 0) &&
39355 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39356 Shuffle = X86ISD::VZEXT_MOVL;
39357 if (MaskEltSize == 16)
39358 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39359 else
39360 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39361 return true;
39362 }
39363
39364 // Check if we have SSE3, which will let us use MOVDDUP etc. These
39365 // instructions are no slower than UNPCKLPD but have the option to
39366 // fold a memory load, even an unaligned one, as the input operand.
39367 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39368 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39369 Shuffle = X86ISD::MOVDDUP;
39370 SrcVT = DstVT = MVT::v2f64;
39371 return true;
39372 }
39373 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39374 Shuffle = X86ISD::MOVSLDUP;
39375 SrcVT = DstVT = MVT::v4f32;
39376 return true;
39377 }
39378 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39379 Shuffle = X86ISD::MOVSHDUP;
39380 SrcVT = DstVT = MVT::v4f32;
39381 return true;
39382 }
39383 }
39384
39385 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39386 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39387 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39388 Shuffle = X86ISD::MOVDDUP;
39389 SrcVT = DstVT = MVT::v4f64;
39390 return true;
39391 }
39392 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39393 V1)) {
39394 Shuffle = X86ISD::MOVSLDUP;
39395 SrcVT = DstVT = MVT::v8f32;
39396 return true;
39397 }
39398 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39399 V1)) {
39400 Shuffle = X86ISD::MOVSHDUP;
39401 SrcVT = DstVT = MVT::v8f32;
39402 return true;
39403 }
39404 }
39405
39406 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39407 assert(Subtarget.hasAVX512() &&
39408 "AVX512 required for 512-bit vector shuffles");
39409 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39410 V1)) {
39411 Shuffle = X86ISD::MOVDDUP;
39412 SrcVT = DstVT = MVT::v8f64;
39413 return true;
39414 }
39415 if (isTargetShuffleEquivalent(
39416 MaskVT, Mask,
39417 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39418 Shuffle = X86ISD::MOVSLDUP;
39419 SrcVT = DstVT = MVT::v16f32;
39420 return true;
39421 }
39422 if (isTargetShuffleEquivalent(
39423 MaskVT, Mask,
39424 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39425 Shuffle = X86ISD::MOVSHDUP;
39426 SrcVT = DstVT = MVT::v16f32;
39427 return true;
39428 }
39429 }
39430
39431 return false;
39432}
39433
39434// Attempt to match a combined shuffle mask against supported unary immediate
39435// permute instructions.
39436 // TODO: Investigate sharing more of this with shuffle lowering.
39437 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39438 const APInt &Zeroable,
39439 bool AllowFloatDomain, bool AllowIntDomain,
39440 const SelectionDAG &DAG,
39441 const X86Subtarget &Subtarget,
39442 unsigned &Shuffle, MVT &ShuffleVT,
39443 unsigned &PermuteImm) {
39444 unsigned NumMaskElts = Mask.size();
39445 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39446 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39447 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39448 bool ContainsZeros = isAnyZero(Mask);
39449
39450 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39451 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39452 // Check for lane crossing permutes.
39453 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39454 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39455 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39456 Shuffle = X86ISD::VPERMI;
39457 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39458 PermuteImm = getV4X86ShuffleImm(Mask);
39459 return true;
39460 }
39461 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39462 SmallVector<int, 4> RepeatedMask;
39463 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39464 Shuffle = X86ISD::VPERMI;
39465 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39466 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39467 return true;
39468 }
39469 }
39470 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39471 // VPERMILPD can permute with a non-repeating shuffle.
39472 Shuffle = X86ISD::VPERMILPI;
39473 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39474 PermuteImm = 0;
39475 for (int i = 0, e = Mask.size(); i != e; ++i) {
39476 int M = Mask[i];
39477 if (M == SM_SentinelUndef)
39478 continue;
39479 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39480 PermuteImm |= (M & 1) << i;
39481 }
39482 return true;
39483 }
39484 }
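// Illustrative example of the VPERMILPD immediate built above: each mask
// element selects within its own 64-bit pair, and bit i of the immediate is
// (Mask[i] & 1). For a v4f64 mask {1, 0, 3, 2} this gives
// PermuteImm = 0b0101 = 0x5.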
39485
39486 // We are checking for either a shuffle match or a shift match. Loop twice so
39487 // we can control which we try to match first, depending on target preference.
39488 for (unsigned Order = 0; Order < 2; ++Order) {
39489 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39490 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39491 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39492 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39493 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39494 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39495 SmallVector<int, 4> RepeatedMask;
39496 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39497 // Narrow the repeated mask to create 32-bit element permutes.
39498 SmallVector<int, 4> WordMask = RepeatedMask;
39499 if (MaskScalarSizeInBits == 64)
39500 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39501
39502 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39503 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39504 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39505 PermuteImm = getV4X86ShuffleImm(WordMask);
39506 return true;
39507 }
39508 }
39509
39510 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39511 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39512 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39513 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39514 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39515 SmallVector<int, 4> RepeatedMask;
39516 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39517 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39518 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39519
39520 // PSHUFLW: permute lower 4 elements only.
39521 if (isUndefOrInRange(LoMask, 0, 4) &&
39522 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39523 Shuffle = X86ISD::PSHUFLW;
39524 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39525 PermuteImm = getV4X86ShuffleImm(LoMask);
39526 return true;
39527 }
39528
39529 // PSHUFHW: permute upper 4 elements only.
39530 if (isUndefOrInRange(HiMask, 4, 8) &&
39531 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39532 // Offset the HiMask so that we can create the shuffle immediate.
39533 int OffsetHiMask[4];
39534 for (int i = 0; i != 4; ++i)
39535 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39536
39537 Shuffle = X86ISD::PSHUFHW;
39538 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39539 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39540 return true;
39541 }
39542 }
39543 }
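// Illustrative example of the PSHUFHW path above: a repeated HiMask of
// {5, 4, 7, 6} is rebased to {1, 0, 3, 2}, which encodes as
// PermuteImm = 0xB1 (each index packed as a 2-bit field, lowest index in
// bits [1:0]).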
39544 } else {
39545 // Attempt to match against bit rotates.
39546 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39547 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39548 Subtarget.hasAVX512())) {
39549 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39550 Subtarget, Mask);
39551 if (0 < RotateAmt) {
39552 Shuffle = X86ISD::VROTLI;
39553 PermuteImm = (unsigned)RotateAmt;
39554 return true;
39555 }
39556 }
39557 }
39558 // Attempt to match against byte/bit shifts.
39559 if (AllowIntDomain &&
39560 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39561 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39562 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39563 int ShiftAmt =
39564 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39565 Zeroable, Subtarget);
39566 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39567 32 <= ShuffleVT.getScalarSizeInBits())) {
39568 // Byte shifts can be slower so only match them on second attempt.
39569 if (Order == 0 &&
39570 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39571 continue;
39572
39573 PermuteImm = (unsigned)ShiftAmt;
39574 return true;
39575 }
39576
39577 }
39578 }
39579
39580 return false;
39581}
39582
39583// Attempt to match a combined unary shuffle mask against supported binary
39584// shuffle instructions.
39585// TODO: Investigate sharing more of this with shuffle lowering.
39586static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39587 bool AllowFloatDomain, bool AllowIntDomain,
39588 SDValue &V1, SDValue &V2, const SDLoc &DL,
39589 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39590 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39591 bool IsUnary) {
39592 unsigned NumMaskElts = Mask.size();
39593 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39594 unsigned SizeInBits = MaskVT.getSizeInBits();
39595
39596 if (MaskVT.is128BitVector()) {
39597 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39598 AllowFloatDomain) {
39599 V2 = V1;
39600 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39601 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39602 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39603 return true;
39604 }
39605 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39606 AllowFloatDomain) {
39607 V2 = V1;
39608 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39609 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39610 return true;
39611 }
39612 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39613 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39614 std::swap(V1, V2);
39615 Shuffle = X86ISD::MOVSD;
39616 SrcVT = DstVT = MVT::v2f64;
39617 return true;
39618 }
39619 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39620 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39621 Shuffle = X86ISD::MOVSS;
39622 SrcVT = DstVT = MVT::v4f32;
39623 return true;
39624 }
39625 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39626 DAG) &&
39627 Subtarget.hasFP16()) {
39628 Shuffle = X86ISD::MOVSH;
39629 SrcVT = DstVT = MVT::v8f16;
39630 return true;
39631 }
39632 }
39633
39634 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39635 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39636 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39637 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39638 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39639 Subtarget)) {
39640 DstVT = MaskVT;
39641 return true;
39642 }
39643 }
39644 // TODO: Can we handle this inside matchShuffleWithPACK?
39645 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39646 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39647 V1.getScalarValueSizeInBits() == 64 &&
39648 V2.getScalarValueSizeInBits() == 64) {
39649 // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
39650 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39651 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39652 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39653 SrcVT = MVT::v4i32;
39654 DstVT = MVT::v8i16;
39655 Shuffle = X86ISD::PACKUS;
39656 return true;
39657 }
39658 // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
39659 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39660 SrcVT = MVT::v8i16;
39661 DstVT = MVT::v16i8;
39662 Shuffle = X86ISD::PACKUS;
39663 return true;
39664 }
39665 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39666 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39667 SrcVT = MVT::v4i32;
39668 DstVT = MVT::v8i16;
39669 Shuffle = X86ISD::PACKSS;
39670 return true;
39671 }
39672 }
39673
39674 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39675 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39676 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39677 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39678 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39679 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39680 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39681 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39682 Subtarget)) {
39683 SrcVT = DstVT = MaskVT;
39684 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39685 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39686 return true;
39687 }
39688 }
39689
39690 // Attempt to match against an OR if we're performing a blend shuffle and the
39691 // non-blended source element is zero in each case.
39692 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39693 if (SizeInBits == V1.getValueSizeInBits() &&
39694 SizeInBits == V2.getValueSizeInBits() &&
39695 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39696 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39697 bool IsBlend = true;
39698 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39699 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39700 unsigned Scale1 = NumV1Elts / NumMaskElts;
39701 unsigned Scale2 = NumV2Elts / NumMaskElts;
39702 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39703 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39704 for (unsigned i = 0; i != NumMaskElts; ++i) {
39705 int M = Mask[i];
39706 if (M == SM_SentinelUndef)
39707 continue;
39708 if (M == SM_SentinelZero) {
39709 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39710 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39711 continue;
39712 }
39713 if (M == (int)i) {
39714 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39715 continue;
39716 }
39717 if (M == (int)(i + NumMaskElts)) {
39718 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39719 continue;
39720 }
39721 IsBlend = false;
39722 break;
39723 }
39724 if (IsBlend) {
39725 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39726 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39727 Shuffle = ISD::OR;
39728 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39729 return true;
39730 }
39731 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39732 // FIXME: handle mismatched sizes?
39733 // TODO: investigate if `ISD::OR` handling in
39734 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39735 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39736 unsigned NumElts = V.getValueType().getVectorNumElements();
39737 KnownBits Known(NumElts);
39738 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39739 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39740 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39741 if (PeepholeKnown.isZero())
39742 Known.Zero.setBit(EltIdx);
39743 if (PeepholeKnown.isAllOnes())
39744 Known.One.setBit(EltIdx);
39745 }
39746 return Known;
39747 };
39748
39749 KnownBits V1Known = computeKnownBitsElementWise(V1);
39750 KnownBits V2Known = computeKnownBitsElementWise(V2);
39751
39752 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39753 int M = Mask[i];
39754 if (M == SM_SentinelUndef)
39755 continue;
39756 if (M == SM_SentinelZero) {
39757 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39758 continue;
39759 }
39760 if (M == (int)i) {
39761 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39762 continue;
39763 }
39764 if (M == (int)(i + NumMaskElts)) {
39765 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39766 continue;
39767 }
39768 llvm_unreachable("will not get here.");
39769 }
39770 if (IsBlend) {
39771 Shuffle = ISD::OR;
39772 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39773 return true;
39774 }
39775 }
39776 }
39777 }
39778
39779 return false;
39780}
39781
39782 static bool matchBinaryPermuteShuffle(
39783 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39784 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39785 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39786 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39787 unsigned NumMaskElts = Mask.size();
39788 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39789
39790 // Attempt to match against VALIGND/VALIGNQ rotate.
39791 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39792 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39793 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39794 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39795 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39796 MaskVT.getSizeInBits() / EltSizeInBits);
39797 if (!isAnyZero(Mask)) {
39798 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39799 if (0 < Rotation) {
39800 Shuffle = X86ISD::VALIGN;
39801 ShuffleVT = AlignVT;
39802 PermuteImm = Rotation;
39803 return true;
39804 }
39805 }
39806 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39807 unsigned ZeroLo = Zeroable.countr_one();
39808 unsigned ZeroHi = Zeroable.countl_one();
39809 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39810 if (ZeroLo) {
39811 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39812 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39813 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39814 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39815 Shuffle = X86ISD::VALIGN;
39816 ShuffleVT = AlignVT;
39817 PermuteImm = NumMaskElts - ZeroLo;
39818 return true;
39819 }
39820 }
39821 if (ZeroHi) {
39822 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39823 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39824 ZeroHi);
39825 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39826 V2 = V1;
39827 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39828 Shuffle = X86ISD::VALIGN;
39829 ShuffleVT = AlignVT;
39830 PermuteImm = ZeroHi;
39831 return true;
39832 }
39833 }
39834 }
39835
39836 // Attempt to match against PALIGNR byte rotate.
39837 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39838 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39839 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39840 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39841 if (0 < ByteRotation) {
39842 Shuffle = X86ISD::PALIGNR;
39843 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39844 PermuteImm = ByteRotation;
39845 return true;
39846 }
39847 }
39848
39849 // Attempt to combine to X86ISD::BLENDI.
39850 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39851 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39852 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39853 uint64_t BlendMask = 0;
39854 bool ForceV1Zero = false, ForceV2Zero = false;
39855 SmallVector<int, 8> TargetMask(Mask);
39856 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39857 ForceV2Zero, BlendMask)) {
39858 if (MaskVT == MVT::v16i16) {
39859 // We can only use v16i16 PBLENDW if the lanes are repeated.
39860 SmallVector<int, 8> RepeatedMask;
39861 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39862 RepeatedMask)) {
39863 assert(RepeatedMask.size() == 8 &&
39864 "Repeated mask size doesn't match!");
39865 PermuteImm = 0;
39866 for (int i = 0; i < 8; ++i)
39867 if (RepeatedMask[i] >= 8)
39868 PermuteImm |= 1 << i;
39869 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39870 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39871 Shuffle = X86ISD::BLENDI;
39872 ShuffleVT = MaskVT;
39873 return true;
39874 }
39875 } else {
39876 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39877 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39878 PermuteImm = (unsigned)BlendMask;
39879 Shuffle = X86ISD::BLENDI;
39880 ShuffleVT = MaskVT;
39881 return true;
39882 }
39883 }
39884 }
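// Illustrative example of the v16i16 PBLENDW case above: the 8-bit blend
// immediate is applied per 128-bit lane, so both lanes must repeat the same
// pattern. A repeated mask of {0, 9, 2, 11, 4, 13, 6, 15} takes the odd
// elements from V2 and encodes as PermuteImm = 0xAA.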
39885
39886 // Attempt to combine to INSERTPS, but only if it has elements that need to
39887 // be set to zero.
39888 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39889 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39890 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39891 Shuffle = X86ISD::INSERTPS;
39892 ShuffleVT = MVT::v4f32;
39893 return true;
39894 }
39895
39896 // Attempt to combine to SHUFPD.
39897 if (AllowFloatDomain && EltSizeInBits == 64 &&
39898 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39899 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39900 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39901 bool ForceV1Zero = false, ForceV2Zero = false;
39902 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39903 PermuteImm, Mask, Zeroable)) {
39904 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39905 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39906 Shuffle = X86ISD::SHUFP;
39907 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39908 return true;
39909 }
39910 }
39911
39912 // Attempt to combine to SHUFPS.
39913 if (AllowFloatDomain && EltSizeInBits == 32 &&
39914 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39915 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39916 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39917 SmallVector<int, 4> RepeatedMask;
39918 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39919 // Match each half of the repeated mask to determine if it's just
39920 // referencing one of the vectors, is zeroable, or is entirely undef.
39921 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39922 int M0 = RepeatedMask[Offset];
39923 int M1 = RepeatedMask[Offset + 1];
39924
39925 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39926 return DAG.getUNDEF(MaskVT);
39927 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39928 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39929 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39930 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39931 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39932 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39933 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39934 return V1;
39935 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39936 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39937 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39938 return V2;
39939 }
39940
39941 return SDValue();
39942 };
39943
39944 int ShufMask[4] = {-1, -1, -1, -1};
39945 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39946 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39947
39948 if (Lo && Hi) {
39949 V1 = Lo;
39950 V2 = Hi;
39951 Shuffle = X86ISD::SHUFP;
39952 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39953 PermuteImm = getV4X86ShuffleImm(ShufMask);
39954 return true;
39955 }
39956 }
39957 }
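// Illustrative example of the SHUFPS matching above: a repeated mask of
// {0, 1, 6, 7} resolves the low half to V1 (selectors 0, 1) and the high half
// to V2 (selectors 2, 3), giving ShufMask = {0, 1, 2, 3} and
// PermuteImm = 0xE4.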
39958
39959 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39960 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39961 MaskVT.is128BitVector() &&
39962 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39963 Shuffle = X86ISD::INSERTPS;
39964 ShuffleVT = MVT::v4f32;
39965 return true;
39966 }
39967
39968 return false;
39969}
39970
39971 static SDValue combineX86ShuffleChainWithExtract(
39972 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39973 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39974 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39975 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39976 const X86Subtarget &Subtarget);
39977
39978/// Combine an arbitrary chain of shuffles into a single instruction if
39979/// possible.
39980///
39981/// This is the leaf of the recursive combine below. When we have found some
39982/// chain of single-use x86 shuffle instructions and accumulated the combined
39983/// shuffle mask represented by them, this will try to pattern match that mask
39984/// into either a single instruction if there is a special purpose instruction
39985/// for this operation, or into a PSHUFB instruction which is a fully general
39986 /// instruction but should only be used to replace chains over a certain depth.
39987 static SDValue combineX86ShuffleChain(
39988 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39989 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39990 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39991 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39992 const X86Subtarget &Subtarget) {
39993 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39994 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39995 "Unexpected number of shuffle inputs!");
39996 unsigned RootSizeInBits = RootVT.getSizeInBits();
39997 unsigned NumRootElts = RootVT.getVectorNumElements();
39998
39999 // Canonicalize shuffle input op to the requested type.
40000 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
40001 if (VT.getSizeInBits() > Op.getValueSizeInBits())
40002 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
40003 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
40004 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
40005 return DAG.getBitcast(VT, Op);
40006 };
40007
40008 // Find the inputs that enter the chain. Note that multiple uses are OK
40009 // here, we're not going to remove the operands we find.
40010 bool UnaryShuffle = (Inputs.size() == 1);
40011 SDValue V1 = peekThroughBitcasts(Inputs[0]);
40012 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
40013 : peekThroughBitcasts(Inputs[1]));
40014
40015 MVT VT1 = V1.getSimpleValueType();
40016 MVT VT2 = V2.getSimpleValueType();
40017 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
40018 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
40019
40020 SDValue Res;
40021
40022 unsigned NumBaseMaskElts = BaseMask.size();
40023 if (NumBaseMaskElts == 1) {
40024 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
40025 return CanonicalizeShuffleInput(RootVT, V1);
40026 }
40027
40028 bool OptForSize = DAG.shouldOptForSize();
40029 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
40030 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40031 (RootVT.isFloatingPoint() && Depth >= 1) ||
40032 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40033
40034 // If we are shuffling a splat (and not introducing zeros) then we can just
40035 // use it directly. This works for smaller elements as well, since they
40036 // already repeat across each mask element.
40037 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40038 V1.getValueSizeInBits() >= RootSizeInBits &&
40039 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40040 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40041 return CanonicalizeShuffleInput(RootVT, V1);
40042 }
40043
40044 SmallVector<int, 64> Mask(BaseMask);
40045
40046 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40047 // etc. can be simplified.
40048 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40049 SmallVector<int> ScaledMask, IdentityMask;
40050 unsigned NumElts = VT1.getVectorNumElements();
40051 if (Mask.size() <= NumElts &&
40052 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40053 for (unsigned i = 0; i != NumElts; ++i)
40054 IdentityMask.push_back(i);
40055 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40056 V2))
40057 return CanonicalizeShuffleInput(RootVT, V1);
40058 }
40059 }
40060
40061 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40062 if (RootVT.is512BitVector() &&
40063 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40064 // If the upper subvectors are zeroable, then an extract+insert is more
40065 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40066 // to zero the upper subvectors.
40067 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40068 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40069 return SDValue(); // Nothing to do!
40070 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40071 "Unexpected lane shuffle");
40072 Res = CanonicalizeShuffleInput(RootVT, V1);
40073 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40074 bool UseZero = isAnyZero(Mask);
40075 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40076 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40077 }
40078
40079 // Narrow shuffle mask to v4x128.
40080 SmallVector<int, 4> ScaledMask;
40081 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40082 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40083
40084 // Try to lower to vshuf64x2/vshuf32x4.
40085 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40086 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40087 SelectionDAG &DAG) {
40088 int PermMask[4] = {-1, -1, -1, -1};
40089 // Ensure elements came from the same Op.
40090 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40091 for (int i = 0; i < 4; ++i) {
40092 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40093 if (ScaledMask[i] < 0)
40094 continue;
40095
40096 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40097 unsigned OpIndex = i / 2;
40098 if (Ops[OpIndex].isUndef())
40099 Ops[OpIndex] = Op;
40100 else if (Ops[OpIndex] != Op)
40101 return SDValue();
40102
40103 PermMask[i] = ScaledMask[i] % 4;
40104 }
40105
40106 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40107 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40108 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40109 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40110 };
40111
40112 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40113 // doesn't work because our mask is for 128 bits and we don't have an MVT
40114 // to match that.
40115 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40116 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40117 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40118 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40119 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40120 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40121 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40122 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40123 ScaledMask[1] == (ScaledMask[3] % 2));
40124
40125 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40126 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40127 return SDValue(); // Nothing to do!
40128 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40129 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40130 return DAG.getBitcast(RootVT, V);
40131 }
40132 }
40133
40134 // Handle 128-bit lane shuffles of 256-bit vectors.
40135 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40136 // If the upper half is zeroable, then an extract+insert is more optimal
40137 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40138 // zero the upper half.
40139 if (isUndefOrZero(Mask[1])) {
40140 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40141 return SDValue(); // Nothing to do!
40142 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40143 Res = CanonicalizeShuffleInput(RootVT, V1);
40144 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40145 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40146 256);
40147 }
40148
40149 // If we're inserting the low subvector, an insert-subvector 'concat'
40150 // pattern is quicker than VPERM2X128.
40151 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40152 !Subtarget.hasAVX2()) {
40153 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40154 return SDValue(); // Nothing to do!
40155 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40156 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40157 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40158 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40159 }
40160
40161 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40162 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40163 // feature.
40164 // Prefer blends for sequential shuffles unless we are optimizing for size.
40165 if (UnaryShuffle &&
40166 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40167 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40168 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40169 return SDValue(); // Nothing to do!
40170 unsigned PermMask = 0;
40171 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40172 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40173 return DAG.getNode(
40174 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40175 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40176 }
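// Illustrative example of the unary VPERM2X128 immediate above: each nibble
// selects a 128-bit half and bit 3 of a nibble zeroes that half, so a widened
// mask of {1, SM_SentinelZero} encodes as PermMask = 0x81.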
40177
40178 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40179 return SDValue(); // Nothing to do!
40180
40181 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40182 if (!UnaryShuffle && !IsMaskedShuffle) {
40183 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40184 "Unexpected shuffle sentinel value");
40185 // Prefer blends to X86ISD::VPERM2X128.
40186 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40187 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40188 return SDValue(); // Nothing to do!
40189 unsigned PermMask = 0;
40190 PermMask |= ((Mask[0] & 3) << 0);
40191 PermMask |= ((Mask[1] & 3) << 4);
40192 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40193 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40194 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40195 CanonicalizeShuffleInput(RootVT, LHS),
40196 CanonicalizeShuffleInput(RootVT, RHS),
40197 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40198 }
40199 }
40200 }
40201
40202 // For masks that have been widened to 128-bit elements or more,
40203 // narrow back down to 64-bit elements.
40204 if (BaseMaskEltSizeInBits > 64) {
40205 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40206 int MaskScale = BaseMaskEltSizeInBits / 64;
40207 SmallVector<int, 64> ScaledMask;
40208 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40209 Mask = std::move(ScaledMask);
40210 }
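// Illustrative example of the narrowing above: a 256-bit root whose widened
// mask is {1, 0} over 128-bit elements becomes {2, 3, 0, 1} over 64-bit
// elements.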
40211
40212 // For masked shuffles, we're trying to match the root width for better
40213 // writemask folding, attempt to scale the mask.
40214 // TODO - variable shuffles might need this to be widened again.
40215 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40216 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40217 int MaskScale = NumRootElts / Mask.size();
40218 SmallVector<int, 64> ScaledMask;
40219 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40220 Mask = std::move(ScaledMask);
40221 }
40222
40223 unsigned NumMaskElts = Mask.size();
40224 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40226
40227 // Determine the effective mask value type.
40228 FloatDomain &= (32 <= MaskEltSizeInBits);
40229 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40230 : MVT::getIntegerVT(MaskEltSizeInBits);
40231 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40232
40233 // Only allow legal mask types.
40234 if (!TLI.isTypeLegal(MaskVT))
40235 return SDValue();
40236
40237 // Attempt to match the mask against known shuffle patterns.
40238 MVT ShuffleSrcVT, ShuffleVT;
40239 unsigned Shuffle, PermuteImm;
40240
40241 // Which shuffle domains are permitted?
40242 // Permit domain crossing at higher combine depths.
40243 // TODO: Should we indicate which domain is preferred if both are allowed?
40244 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40245 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40246 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40247
40248 // Determine zeroable mask elements.
40249 APInt KnownUndef, KnownZero;
40250 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40251 APInt Zeroable = KnownUndef | KnownZero;
40252
40253 if (UnaryShuffle) {
40254 // Attempt to match against broadcast-from-vector.
40255 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40256 if ((Subtarget.hasAVX2() ||
40257 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40258 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40259 if (isUndefOrEqual(Mask, 0)) {
40260 if (V1.getValueType() == MaskVT &&
40261 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40262 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40263 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40264 return SDValue(); // Nothing to do!
40265 Res = V1.getOperand(0);
40266 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40267 return DAG.getBitcast(RootVT, Res);
40268 }
40269 if (Subtarget.hasAVX2()) {
40270 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40271 return SDValue(); // Nothing to do!
40272 Res = CanonicalizeShuffleInput(MaskVT, V1);
40273 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40274 return DAG.getBitcast(RootVT, Res);
40275 }
40276 }
40277 }
40278
40279 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40280 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40281 (!IsMaskedShuffle ||
40282 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40283 if (Depth == 0 && RootOpc == Shuffle)
40284 return SDValue(); // Nothing to do!
40285 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40286 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40287 return DAG.getBitcast(RootVT, Res);
40288 }
40289
40290 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40291 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40292 PermuteImm) &&
40293 (!IsMaskedShuffle ||
40294 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40295 if (Depth == 0 && RootOpc == Shuffle)
40296 return SDValue(); // Nothing to do!
40297 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40298 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40299 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40300 return DAG.getBitcast(RootVT, Res);
40301 }
40302 }
40303
40304 // Attempt to combine to INSERTPS, but only if the inserted element has come
40305 // from a scalar.
40306 // TODO: Handle other insertions here as well?
40307 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40308 Subtarget.hasSSE41() &&
40309 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40310 if (MaskEltSizeInBits == 32) {
40311 SDValue SrcV1 = V1, SrcV2 = V2;
40312 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40313 DAG) &&
40314 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40315 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40316 return SDValue(); // Nothing to do!
40317 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40318 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40319 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40320 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40321 return DAG.getBitcast(RootVT, Res);
40322 }
40323 }
40324 if (MaskEltSizeInBits == 64 &&
40325 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40326 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40327 V2.getScalarValueSizeInBits() <= 32) {
40328 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40329 return SDValue(); // Nothing to do!
40330 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40331 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40332 CanonicalizeShuffleInput(MVT::v4f32, V1),
40333 CanonicalizeShuffleInput(MVT::v4f32, V2),
40334 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40335 return DAG.getBitcast(RootVT, Res);
40336 }
40337 }
40338
40339 SDValue NewV1 = V1; // Save operands in case early exit happens.
40340 SDValue NewV2 = V2;
40341 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40342 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40343 ShuffleVT, UnaryShuffle) &&
40344 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40345 if (Depth == 0 && RootOpc == Shuffle)
40346 return SDValue(); // Nothing to do!
40347 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40348 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40349 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40350 return DAG.getBitcast(RootVT, Res);
40351 }
40352
40353 NewV1 = V1; // Save operands in case early exit happens.
40354 NewV2 = V2;
40355 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40356 AllowIntDomain, NewV1, NewV2, DL, DAG,
40357 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40358 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40359 if (Depth == 0 && RootOpc == Shuffle)
40360 return SDValue(); // Nothing to do!
40361 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40362 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40363 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40364 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40365 return DAG.getBitcast(RootVT, Res);
40366 }
40367
40368 // Typically from here on, we need an integer version of MaskVT.
40369 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40370 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40371
40372 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40373 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40374 uint64_t BitLen, BitIdx;
40375 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40376 Zeroable)) {
40377 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40378 return SDValue(); // Nothing to do!
40379 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40380 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40381 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40382 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40383 return DAG.getBitcast(RootVT, Res);
40384 }
40385
40386 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40387 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40388 return SDValue(); // Nothing to do!
40389 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40390 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40391 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40392 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40393 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40394 return DAG.getBitcast(RootVT, Res);
40395 }
40396 }
40397
40398 // Match shuffle against TRUNCATE patterns.
40399 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40400 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40401 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40402 Subtarget)) {
40403 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40404 ShuffleSrcVT.getVectorNumElements();
40405 unsigned Opc =
40406 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40407 if (Depth == 0 && RootOpc == Opc)
40408 return SDValue(); // Nothing to do!
40409 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40410 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40411 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40412 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40413 return DAG.getBitcast(RootVT, Res);
40414 }
40415
40416 // Do we need a more general binary truncation pattern?
40417 if (RootSizeInBits < 512 &&
40418 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40419 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40420 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40421 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40422 // Bail if this was already a truncation or PACK node.
40423 // We sometimes fail to match PACK if we demand known undef elements.
40424 if (Depth == 0 &&
40425 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40426 RootOpc == X86ISD::PACKUS))
40427 return SDValue(); // Nothing to do!
40428 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40429 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40430 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40431 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40432 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40433 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40434 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40435 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40436 return DAG.getBitcast(RootVT, Res);
40437 }
40438 }
40439
40440 // Don't try to re-form single instruction chains under any circumstances now
40441 // that we've done encoding canonicalization for them.
40442 if (Depth < 1)
40443 return SDValue();
40444
40445 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40446 return isTargetShuffleVariableMask(N->getOpcode());
40447 });
40448 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40449 return (N->getOpcode() == X86ISD::VPERMV3 ||
40450 N->getOpcode() == X86ISD::VPERMV);
40451 });
40452
40453 // Depth threshold above which we can efficiently use variable mask shuffles.
40454 int VariableCrossLaneShuffleDepth =
40455 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40456 int VariablePerLaneShuffleDepth =
40457 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40458 AllowVariableCrossLaneMask &=
40459 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40460 AllowVariablePerLaneMask &=
40461 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40462 // VPERM2W/VPERM2B are 3 uops on Skylake and Icelake so we require a
40463 // higher depth before combining them.
40464 int BWIVPERMV3ShuffleDepth =
40465 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40466 bool AllowBWIVPERMV3 =
40467 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40468
40469 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40470 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40471 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40472
40473 bool MaskContainsZeros = isAnyZero(Mask);
40474
40475 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40476 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40477 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40478 if (Subtarget.hasAVX2() &&
40479 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40480 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40481 Res = CanonicalizeShuffleInput(MaskVT, V1);
40482 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40483 return DAG.getBitcast(RootVT, Res);
40484 }
40485 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40486 if ((Subtarget.hasAVX512() &&
40487 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40488 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40489 (Subtarget.hasBWI() &&
40490 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40491 (Subtarget.hasVBMI() &&
40492 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40493 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40494 V2 = DAG.getUNDEF(MaskVT);
40495 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40496 return DAG.getBitcast(RootVT, Res);
40497 }
40498 }
40499
40500 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40501 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40502 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40503 ((Subtarget.hasAVX512() &&
40504 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40505 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40506 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40507 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40508 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40509 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40510 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40511 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40512 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40513 for (unsigned i = 0; i != NumMaskElts; ++i)
40514 if (Mask[i] == SM_SentinelZero)
40515 Mask[i] = NumMaskElts + i;
40516 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40517 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40518 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40519 return DAG.getBitcast(RootVT, Res);
40520 }
40521
40522 // If that failed and either input is extracted then try to combine as a
40523 // shuffle with the larger type.
40524 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40525 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40526 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40527 IsMaskedShuffle, DAG, DL, Subtarget))
40528 return WideShuffle;
40529
40530 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40531 // (non-VLX will pad to 512-bit shuffles).
40532 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40533 ((Subtarget.hasAVX512() &&
40534 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40535 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40536 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40537 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40538 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40539 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40540 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40541 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40542 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40543 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40544 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40545 return DAG.getBitcast(RootVT, Res);
40546 }
40547 return SDValue();
40548 }
40549
40550 // See if we can combine a single input shuffle with zeros to a bit-mask,
40551 // which is much simpler than any shuffle.
40552 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40553 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40554 TLI.isTypeLegal(MaskVT)) {
40555 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40556 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40557 APInt UndefElts(NumMaskElts, 0);
40558 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40559 for (unsigned i = 0; i != NumMaskElts; ++i) {
40560 int M = Mask[i];
40561 if (M == SM_SentinelUndef) {
40562 UndefElts.setBit(i);
40563 continue;
40564 }
40565 if (M == SM_SentinelZero)
40566 continue;
40567 EltBits[i] = AllOnes;
40568 }
40569 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40570 Res = CanonicalizeShuffleInput(MaskVT, V1);
40571 unsigned AndOpcode =
40572 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40573 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40574 return DAG.getBitcast(RootVT, Res);
40575 }
40576
40577 // If we have a single input shuffle with different shuffle patterns in the
40578 // 128-bit lanes, lower to VPERMILPS with a variable mask.
40579 // TODO: Combine other mask types at higher depths.
40580 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40581 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40582 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40583 SmallVector<SDValue, 16> VPermIdx;
40584 for (int M : Mask) {
40585 SDValue Idx =
40586 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40587 VPermIdx.push_back(Idx);
40588 }
40589 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40590 Res = CanonicalizeShuffleInput(MaskVT, V1);
40591 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40592 return DAG.getBitcast(RootVT, Res);
40593 }
40594
40595 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40596 // to VPERMIL2PD/VPERMIL2PS.
40597 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40598 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40599 MaskVT == MVT::v8f32)) {
40600 // VPERMIL2 Operation.
40601 // Bits[3] - Match Bit.
40602 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40603 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
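// For example, with a v4f32 mask, element M == 5 (element 1 of the second
// source) gives Index == (5 % 4) + ((5 / 4) * 4) == 5, i.e. selector 0b101:
// bit 2 picks the second source and bits[1:0] pick the element within the
// lane (for 64-bit elements the index is shifted up into bits[2:1]).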
40604 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40605 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40606 SmallVector<int, 8> VPerm2Idx;
40607 unsigned M2ZImm = 0;
40608 for (int M : Mask) {
40609 if (M == SM_SentinelUndef) {
40610 VPerm2Idx.push_back(-1);
40611 continue;
40612 }
40613 if (M == SM_SentinelZero) {
40614 M2ZImm = 2;
40615 VPerm2Idx.push_back(8);
40616 continue;
40617 }
40618 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40619 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40620 VPerm2Idx.push_back(Index);
40621 }
40622 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40623 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40624 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40625 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40626 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40627 return DAG.getBitcast(RootVT, Res);
40628 }
40629
40630 // If we have 3 or more shuffle instructions or a chain involving a variable
40631 // mask, we can replace them with a single PSHUFB instruction profitably.
40632 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40633 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40634 // more aggressive.
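// The loop below rescales the mask to byte granularity: e.g. a v4i32 mask of
// {2, 0, SM_SentinelZero, 3} on a 128-bit root (Ratio == 4) becomes the byte
// mask {8,9,10,11, 0,1,2,3, 0x80,0x80,0x80,0x80, 12,13,14,15}, where 0x80 has
// the high bit set so PSHUFB zeroes that byte.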
40635 if (UnaryShuffle && AllowVariablePerLaneMask &&
40636 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40637 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40638 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40639 SmallVector<SDValue, 16> PSHUFBMask;
40640 int NumBytes = RootVT.getSizeInBits() / 8;
40641 int Ratio = NumBytes / NumMaskElts;
40642 for (int i = 0; i < NumBytes; ++i) {
40643 int M = Mask[i / Ratio];
40644 if (M == SM_SentinelUndef) {
40645 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40646 continue;
40647 }
40648 if (M == SM_SentinelZero) {
40649 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40650 continue;
40651 }
40652 M = Ratio * M + i % Ratio;
40653 assert((M / 16) == (i / 16) && "Lane crossing detected");
40654 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40655 }
40656 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40657 Res = CanonicalizeShuffleInput(ByteVT, V1);
40658 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40659 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40660 return DAG.getBitcast(RootVT, Res);
40661 }
40662
40663 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40664 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40665 // slower than PSHUFB on targets that support both.
40666 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40667 Subtarget.hasXOP()) {
40668 // VPPERM Mask Operation
40669 // Bits[4:0] - Byte Index (0 - 31)
40670 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
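// For example, mask byte 19 (0x13) selects byte 3 of the second source, while
// 0x80 encodes permute operation 4 in bits[7:5] and zeroes the result byte.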
40671 SmallVector<SDValue, 16> VPPERMMask;
40672 int NumBytes = 16;
40673 int Ratio = NumBytes / NumMaskElts;
40674 for (int i = 0; i < NumBytes; ++i) {
40675 int M = Mask[i / Ratio];
40676 if (M == SM_SentinelUndef) {
40677 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40678 continue;
40679 }
40680 if (M == SM_SentinelZero) {
40681 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40682 continue;
40683 }
40684 M = Ratio * M + i % Ratio;
40685 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40686 }
40687 MVT ByteVT = MVT::v16i8;
40688 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40689 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40690 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40691 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40692 return DAG.getBitcast(RootVT, Res);
40693 }
40694
40695 // If that failed and either input is extracted then try to combine as a
40696 // shuffle with the larger type.
40697 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40698 Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40699 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40700 DAG, DL, Subtarget))
40701 return WideShuffle;
40702
40703 // If we have a dual input shuffle then lower to VPERMV3,
40704 // (non-VLX will pad to 512-bit shuffles)
40705 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40706 ((Subtarget.hasAVX512() &&
40707 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40708 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40709 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40710 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40711 MaskVT == MVT::v16i32)) ||
40712 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40713 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40714 MaskVT == MVT::v32i16)) ||
40715 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40716 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40717 MaskVT == MVT::v64i8)))) {
40718 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40719 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40720 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40721 return DAG.getBitcast(RootVT, Res);
40722 }
40723
40724 // Failed to find any combines.
40725 return SDValue();
40726}
40727
40728// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40729// instruction if possible.
40730//
40731// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40732// type size to attempt to combine:
40733// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40734// -->
40735// extract_subvector(shuffle(x,y,m2),0)
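// For example, with x,y : v8f32, c1 == c2 == 4 (the upper halves) and
// m1 == {0,1,4,5}, the mask is regrown to the v8f32 domain as
// m2 == {4,5,12,13,-1,-1,-1,-1} and the original value becomes the low
// 128 bits of shuffle(x,y,m2).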
40736 static SDValue combineX86ShuffleChainWithExtract(
40737 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40738 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40739 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40740 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40741 const X86Subtarget &Subtarget) {
40742 unsigned NumMaskElts = BaseMask.size();
40743 unsigned NumInputs = Inputs.size();
40744 if (NumInputs == 0)
40745 return SDValue();
40746
40747 unsigned RootSizeInBits = RootVT.getSizeInBits();
40748 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40749 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40750
40751 // Peek through subvectors to find widest legal vector.
40752 // TODO: Handle ISD::TRUNCATE
40753 unsigned WideSizeInBits = RootSizeInBits;
40754 for (SDValue Input : Inputs) {
40755 Input = peekThroughBitcasts(Input);
40756 while (1) {
40757 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40758 Input = peekThroughBitcasts(Input.getOperand(0));
40759 continue;
40760 }
40761 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40762 Input.getOperand(0).isUndef() &&
40763 isNullConstant(Input.getOperand(2))) {
40764 Input = peekThroughBitcasts(Input.getOperand(1));
40765 continue;
40766 }
40767 break;
40768 }
40769 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40770 WideSizeInBits < Input.getValueSizeInBits())
40771 WideSizeInBits = Input.getValueSizeInBits();
40772 }
40773
40774 // Bail if we fail to find a source larger than the existing root.
40775 if (WideSizeInBits <= RootSizeInBits ||
40776 (WideSizeInBits % RootSizeInBits) != 0)
40777 return SDValue();
40778
40779 // Create new mask for larger type.
40780 SmallVector<int, 64> WideMask;
40781 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40782
40783 // Attempt to peek through inputs and adjust mask when we extract from an
40784 // upper subvector.
40785 int AdjustedMasks = 0;
40786 SmallVector<SDValue, 4> WideInputs(Inputs);
40787 for (unsigned I = 0; I != NumInputs; ++I) {
40788 SDValue &Input = WideInputs[I];
40789 Input = peekThroughBitcasts(Input);
40790 while (1) {
40791 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40792 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40793 uint64_t Idx = Input.getConstantOperandVal(1);
40794 if (Idx != 0) {
40795 ++AdjustedMasks;
40796 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40797 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40798
40799 int lo = I * WideMask.size();
40800 int hi = (I + 1) * WideMask.size();
40801 for (int &M : WideMask)
40802 if (lo <= M && M < hi)
40803 M += Idx;
40804 }
40805 Input = peekThroughBitcasts(Input.getOperand(0));
40806 continue;
40807 }
40808 // TODO: Handle insertions into upper subvectors.
40809 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40810 Input.getOperand(0).isUndef() &&
40811 isNullConstant(Input.getOperand(2))) {
40812 Input = peekThroughBitcasts(Input.getOperand(1));
40813 continue;
40814 }
40815 break;
40816 }
40817 }
40818
40819 // Remove unused/repeated shuffle source ops.
40820 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40821 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40822
40823 // Bail if we're always extracting from the lowest subvectors (in which case
40824 // combineX86ShuffleChain should match this for the current width), or if the
40825 // shuffle still references too many inputs.
40826 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40827 return SDValue();
40828
40829 // Minor canonicalization of the accumulated shuffle mask to make it easier
40830 // to match below. All this does is detect masks with sequential pairs of
40831 // elements, and shrink them to the half-width mask. It does this in a loop
40832 // so it will reduce the size of the mask to the minimal width mask which
40833 // performs an equivalent shuffle.
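// E.g. {0,1,4,5} shrinks to {0,2}, while an identity mask such as {0,1,2,3}
// shrinks to {0,1} and then to {0}.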
40834 while (WideMask.size() > 1) {
40835 SmallVector<int, 64> WidenedMask;
40836 if (!canWidenShuffleElements(WideMask, WidenedMask))
40837 break;
40838 WideMask = std::move(WidenedMask);
40839 }
40840
40841 // Canonicalization of binary shuffle masks to improve pattern matching by
40842 // commuting the inputs.
40843 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40844 ShuffleVectorSDNode::commuteMask(WideMask);
40845 std::swap(WideInputs[0], WideInputs[1]);
40846 }
40847
40848 // Increase depth for every upper subvector we've peeked through.
40849 Depth += AdjustedMasks;
40850
40851 // Attempt to combine wider chain.
40852 // TODO: Can we use a better Root?
40853 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40854 WideInputs.back().getValueSizeInBits()
40855 ? WideInputs.front()
40856 : WideInputs.back();
40857 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40858 "WideRootSize mismatch");
40859
40860 if (SDValue WideShuffle = combineX86ShuffleChain(
40861 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40862 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40863 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40864 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40865 return DAG.getBitcast(RootVT, WideShuffle);
40866 }
40867
40868 return SDValue();
40869}
40870
40871// Canonicalize the combined shuffle mask chain with horizontal ops.
40872// NOTE: This may update the Ops and Mask.
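// Recall that horizontal ops work per 128-bit lane, e.g. for v4f32
// HADD(x,y) == { x0+x1, x2+x3, y0+y1, y2+y3 }, which is what allows shuffles
// of HOP results to be folded into re-arranged HOP operands below.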
40873 static SDValue canonicalizeShuffleMaskWithHorizOp(
40874 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40875 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40876 const X86Subtarget &Subtarget) {
40877 if (Mask.empty() || Ops.empty())
40878 return SDValue();
40879
40880 SmallVector<SDValue> BC;
40881 for (SDValue Op : Ops)
40882 BC.push_back(peekThroughBitcasts(Op));
40883
40884 // All ops must be the same horizop + type.
40885 SDValue BC0 = BC[0];
40886 EVT VT0 = BC0.getValueType();
40887 unsigned Opcode0 = BC0.getOpcode();
40888 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40889 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40890 }))
40891 return SDValue();
40892
40893 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40894 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40895 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40896 if (!isHoriz && !isPack)
40897 return SDValue();
40898
40899 // Do all ops have a single use?
40900 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40901 return Op.hasOneUse() &&
40902 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40903 });
40904
40905 int NumElts = VT0.getVectorNumElements();
40906 int NumLanes = VT0.getSizeInBits() / 128;
40907 int NumEltsPerLane = NumElts / NumLanes;
40908 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40909 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40910 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40911
40912 if (NumEltsPerLane >= 4 &&
40913 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40914 SmallVector<int> LaneMask, ScaledMask;
40915 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40916 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40917 // See if we can remove the shuffle by resorting the HOP chain so that
40918 // the HOP args are pre-shuffled.
40919 // TODO: Generalize to any sized/depth chain.
40920 // TODO: Add support for PACKSS/PACKUS.
40921 if (isHoriz) {
40922 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40923 auto GetHOpSrc = [&](int M) {
40924 if (M == SM_SentinelUndef)
40925 return DAG.getUNDEF(VT0);
40926 if (M == SM_SentinelZero)
40927 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40928 SDValue Src0 = BC[M / 4];
40929 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40930 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40931 return Src1.getOperand(M % 2);
40932 return SDValue();
40933 };
40934 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40935 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40936 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40937 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40938 if (M0 && M1 && M2 && M3) {
40939 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40940 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40941 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40942 }
40943 }
40944 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40945 if (Ops.size() >= 2) {
40946 SDValue LHS, RHS;
40947 auto GetHOpSrc = [&](int M, int &OutM) {
40948 // TODO: Support SM_SentinelZero
40949 if (M < 0)
40950 return M == SM_SentinelUndef;
40951 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40952 if (!LHS || LHS == Src) {
40953 LHS = Src;
40954 OutM = (M % 2);
40955 return true;
40956 }
40957 if (!RHS || RHS == Src) {
40958 RHS = Src;
40959 OutM = (M % 2) + 2;
40960 return true;
40961 }
40962 return false;
40963 };
40964 int PostMask[4] = {-1, -1, -1, -1};
40965 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40966 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40967 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40968 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40969 LHS = DAG.getBitcast(SrcVT, LHS);
40970 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40971 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40972 // Use SHUFPS for the permute so this will work on SSE2 targets,
40973 // shuffle combining and domain handling will simplify this later on.
40974 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40975 Res = DAG.getBitcast(ShuffleVT, Res);
40976 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40977 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40978 }
40979 }
40980 }
40981 }
40982
40983 if (2 < Ops.size())
40984 return SDValue();
40985
40986 SDValue BC1 = BC[BC.size() - 1];
40987 if (Mask.size() == VT0.getVectorNumElements()) {
40988 // Canonicalize binary shuffles of horizontal ops that use the
40989 // same sources to a unary shuffle.
40990 // TODO: Try to perform this fold even if the shuffle remains.
40991 if (Ops.size() == 2) {
40992 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40993 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40994 };
40995 // Commute if all BC0's ops are contained in BC1.
40996 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40997 ContainsOps(BC1, BC0.getOperand(1))) {
40998 ShuffleVectorSDNode::commuteMask(Mask);
40999 std::swap(Ops[0], Ops[1]);
41000 std::swap(BC0, BC1);
41001 }
41002
41003 // If BC1 can be represented by BC0, then convert to unary shuffle.
41004 if (ContainsOps(BC0, BC1.getOperand(0)) &&
41005 ContainsOps(BC0, BC1.getOperand(1))) {
41006 for (int &M : Mask) {
41007 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
41008 continue;
41009 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
41010 M -= NumElts + (SubLane * NumHalfEltsPerLane);
41011 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
41012 M += NumHalfEltsPerLane;
41013 }
41014 }
41015 }
41016
41017 // Canonicalize unary horizontal ops to only refer to lower halves.
41018 for (int i = 0; i != NumElts; ++i) {
41019 int &M = Mask[i];
41020 if (isUndefOrZero(M))
41021 continue;
41022 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
41023 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41024 M -= NumHalfEltsPerLane;
41025 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
41026 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
41027 M -= NumHalfEltsPerLane;
41028 }
41029 }
41030
41031 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41032 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41033 // represents the LHS/RHS inputs for the lower/upper halves.
41034 SmallVector<int, 16> TargetMask128, WideMask128;
41035 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41036 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41037 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41038 bool SingleOp = (Ops.size() == 1);
41039 if (isPack || OneUseOps ||
41040 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41041 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41042 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41043 Lo = Lo.getOperand(WideMask128[0] & 1);
41044 Hi = Hi.getOperand(WideMask128[1] & 1);
41045 if (SingleOp) {
41046 SDValue Undef = DAG.getUNDEF(SrcVT);
41047 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41048 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41049 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41050 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41051 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41052 }
41053 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41054 }
41055 }
41056
41057 // If we are post-shuffling a 256-bit hop and not requiring the upper
41058 // elements, then try to narrow to a 128-bit hop directly.
41059 SmallVector<int, 16> WideMask64;
41060 if (Ops.size() == 1 && NumLanes == 2 &&
41061 scaleShuffleElements(Mask, 4, WideMask64) &&
41062 isUndefInRange(WideMask64, 2, 2)) {
41063 int M0 = WideMask64[0];
41064 int M1 = WideMask64[1];
41065 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41066 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
41067 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41068 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41069 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41070 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41071 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41072 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41073 }
41074 }
41075
41076 return SDValue();
41077}
41078
41079// Attempt to constant fold all of the constant source ops.
41080// Returns true if the entire shuffle is folded to a constant.
41081// TODO: Extend this to merge multiple constant Ops and update the mask.
41082 static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41083 ArrayRef<int> Mask,
41084 ArrayRef<const SDNode *> SrcNodes,
41085 SelectionDAG &DAG, const SDLoc &DL,
41086 const X86Subtarget &Subtarget) {
41087 unsigned SizeInBits = VT.getSizeInBits();
41088 unsigned NumMaskElts = Mask.size();
41089 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41090 unsigned NumOps = Ops.size();
41091
41092 // Extract constant bits from each source op.
41093 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41094 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41095 for (unsigned I = 0; I != NumOps; ++I)
41096 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41097 RawBitsOps[I],
41098 /*AllowWholeUndefs*/ true,
41099 /*AllowPartialUndefs*/ true))
41100 return SDValue();
41101
41102 // If we're optimizing for size, only fold if at least one of the constants is
41103 // only used once or the combined shuffle has included a variable mask
41104 // shuffle; this is to avoid constant pool bloat.
41105 bool IsOptimizingSize = DAG.shouldOptForSize();
41106 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41107 return isTargetShuffleVariableMask(N->getOpcode());
41108 });
41109 if (IsOptimizingSize && !HasVariableMask &&
41110 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41111 return SDValue();
41112
41113 // Shuffle the constant bits according to the mask.
41114 APInt UndefElts(NumMaskElts, 0);
41115 APInt ZeroElts(NumMaskElts, 0);
41116 APInt ConstantElts(NumMaskElts, 0);
41117 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41118 APInt::getZero(MaskSizeInBits));
41119 for (unsigned i = 0; i != NumMaskElts; ++i) {
41120 int M = Mask[i];
41121 if (M == SM_SentinelUndef) {
41122 UndefElts.setBit(i);
41123 continue;
41124 } else if (M == SM_SentinelZero) {
41125 ZeroElts.setBit(i);
41126 continue;
41127 }
41128 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41129
41130 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41131 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41132
41133 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41134 if (SrcUndefElts[SrcMaskIdx]) {
41135 UndefElts.setBit(i);
41136 continue;
41137 }
41138
41139 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41140 APInt &Bits = SrcEltBits[SrcMaskIdx];
41141 if (!Bits) {
41142 ZeroElts.setBit(i);
41143 continue;
41144 }
41145
41146 ConstantElts.setBit(i);
41147 ConstantBitData[i] = Bits;
41148 }
41149 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41150
41151 // Attempt to create a zero vector.
41152 if ((UndefElts | ZeroElts).isAllOnes())
41153 return getZeroVector(VT, Subtarget, DAG, DL);
41154
41155 // Create the constant data.
41156 MVT MaskSVT;
41157 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41158 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41159 else
41160 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41161
41162 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41163 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41164 return SDValue();
41165
41166 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41167 return DAG.getBitcast(VT, CstOp);
41168}
41169
41170namespace llvm {
41171 namespace X86 {
41172 enum {
41173 MaxShuffleCombineDepth = 8
41174 };
41175 } // namespace X86
41176} // namespace llvm
41177
41178/// Fully generic combining of x86 shuffle instructions.
41179///
41180/// This should be the last combine run over the x86 shuffle instructions. Once
41181/// they have been fully optimized, this will recursively consider all chains
41182/// of single-use shuffle instructions, build a generic model of the cumulative
41183/// shuffle operation, and check for simpler instructions which implement this
41184/// operation. We use this primarily for two purposes:
41185///
41186/// 1) Collapse generic shuffles to specialized single instructions when
41187/// equivalent. In most cases, this is just an encoding size win, but
41188/// sometimes we will collapse multiple generic shuffles into a single
41189/// special-purpose shuffle.
41190/// 2) Look for sequences of shuffle instructions with 3 or more total
41191/// instructions, and replace them with the slightly more expensive SSSE3
41192/// PSHUFB instruction if available. We do this as the last combining step
41193/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41194/// a suitable short sequence of other instructions. The PSHUFB will either
41195/// use a register or have to read from memory and so is slightly (but only
41196/// slightly) more expensive than the other shuffle instructions.
41197///
41198/// Because this is inherently a quadratic operation (for each shuffle in
41199/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41200/// This should never be an issue in practice as the shuffle lowering doesn't
41201/// produce sequences of more than 8 instructions.
41202///
41203/// FIXME: We will currently miss some cases where the redundant shuffling
41204/// would simplify under the threshold for PSHUFB formation because of
41205/// combine-ordering. To fix this, we should do the redundant instruction
41206/// combining in this recursive walk.
41207 static SDValue combineX86ShufflesRecursively(
41208 ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41209 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41210 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41211 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41212 const SDLoc &DL, const X86Subtarget &Subtarget) {
41213 assert(!RootMask.empty() &&
41214 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41215 "Illegal shuffle root mask");
41216 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41217 unsigned RootSizeInBits = RootVT.getSizeInBits();
41218 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41219
41220 // Bound the depth of our recursive combine because this is ultimately
41221 // quadratic in nature.
41222 if (Depth >= MaxDepth)
41223 return SDValue();
41224
41225 // Directly rip through bitcasts to find the underlying operand.
41226 SDValue Op = SrcOps[SrcOpIndex];
41227 Op = peekThroughBitcasts(Op);
41228
41229 EVT VT = Op.getValueType();
41230 if (!VT.isVector() || !VT.isSimple())
41231 return SDValue(); // Bail if we hit a non-simple non-vector.
41232
41233 // FIXME: Just bail on f16 for now.
41234 if (VT.getVectorElementType() == MVT::f16)
41235 return SDValue();
41236
41237 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41238 "Can only combine shuffles up to the size of the root op.");
41239
41240 // Create a demanded elts mask from the referenced elements of Op.
41241 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41242 for (int M : RootMask) {
41243 int BaseIdx = RootMask.size() * SrcOpIndex;
41244 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41245 OpDemandedElts.setBit(M - BaseIdx);
41246 }
41247 if (RootSizeInBits != VT.getSizeInBits()) {
41248 // Op is smaller than Root - extract the demanded elts for the subvector.
41249 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41250 unsigned NumOpMaskElts = RootMask.size() / Scale;
41251 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41252 assert(OpDemandedElts
41253 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41254 .isZero() &&
41255 "Out of range elements referenced in root mask");
41256 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41257 }
41258 OpDemandedElts =
41259 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41260
41261 // Extract target shuffle mask and resolve sentinels and inputs.
41262 SmallVector<int, 64> OpMask;
41263 SmallVector<SDValue, 2> OpInputs;
41264 APInt OpUndef, OpZero;
41265 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41266 OpZero, DAG, Depth, false)) {
41267 // Shuffle inputs must not be larger than the shuffle result.
41268 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41269 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41270 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41271 }))
41272 return SDValue();
41273 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41274 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41275 !isNullConstant(Op.getOperand(1))) {
41276 SDValue SrcVec = Op.getOperand(0);
41277 int ExtractIdx = Op.getConstantOperandVal(1);
41278 unsigned NumElts = VT.getVectorNumElements();
41279 OpInputs.assign({SrcVec});
41280 OpMask.assign(NumElts, SM_SentinelUndef);
41281 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41282 OpZero = OpUndef = APInt::getZero(NumElts);
41283 } else {
41284 return SDValue();
41285 }
41286
41287 // If the shuffle result was smaller than the root, we need to adjust the
41288 // mask indices and pad the mask with undefs.
41289 if (RootSizeInBits > VT.getSizeInBits()) {
41290 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41291 unsigned OpMaskSize = OpMask.size();
41292 if (OpInputs.size() > 1) {
41293 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41294 for (int &M : OpMask) {
41295 if (M < 0)
41296 continue;
41297 int EltIdx = M % OpMaskSize;
41298 int OpIdx = M / OpMaskSize;
41299 M = (PaddedMaskSize * OpIdx) + EltIdx;
41300 }
41301 }
41302 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41303 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41304 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41305 }
41306
41307 SmallVector<int, 64> Mask;
41308 SmallVector<SDValue, 16> Ops;
41309
41310 // We don't need to merge masks if the root is empty.
41311 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41312 if (EmptyRoot) {
41313 // Only resolve zeros if it will remove an input, otherwise we might end
41314 // up in an infinite loop.
41315 bool ResolveKnownZeros = true;
41316 if (!OpZero.isZero()) {
41317 APInt UsedInputs = APInt::getZero(OpInputs.size());
41318 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41319 int M = OpMask[i];
41320 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41321 continue;
41322 UsedInputs.setBit(M / OpMask.size());
41323 if (UsedInputs.isAllOnes()) {
41324 ResolveKnownZeros = false;
41325 break;
41326 }
41327 }
41328 }
41329 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41330 ResolveKnownZeros);
41331
41332 Mask = OpMask;
41333 Ops.append(OpInputs.begin(), OpInputs.end());
41334 } else {
41335 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41336
41337 // Add the inputs to the Ops list, avoiding duplicates.
41338 Ops.append(SrcOps.begin(), SrcOps.end());
41339
41340 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41341 // Attempt to find an existing match.
41342 SDValue InputBC = peekThroughBitcasts(Input);
41343 for (int i = 0, e = Ops.size(); i < e; ++i)
41344 if (InputBC == peekThroughBitcasts(Ops[i]))
41345 return i;
41346 // Match failed - should we replace an existing Op?
41347 if (InsertionPoint >= 0) {
41348 Ops[InsertionPoint] = Input;
41349 return InsertionPoint;
41350 }
41351 // Add to the end of the Ops list.
41352 Ops.push_back(Input);
41353 return Ops.size() - 1;
41354 };
41355
41356 SmallVector<int, 2> OpInputIdx;
41357 for (SDValue OpInput : OpInputs)
41358 OpInputIdx.push_back(
41359 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41360
41361 assert(((RootMask.size() > OpMask.size() &&
41362 RootMask.size() % OpMask.size() == 0) ||
41363 (OpMask.size() > RootMask.size() &&
41364 OpMask.size() % RootMask.size() == 0) ||
41365 OpMask.size() == RootMask.size()) &&
41366 "The smaller number of elements must divide the larger.");
41367
41368 // This function can be performance-critical, so we rely on the power-of-2
41369 // knowledge that we have about the mask sizes to replace div/rem ops with
41370 // bit-masks and shifts.
41371 assert(llvm::has_single_bit(RootMask.size()) &&
41372 "Non-power-of-2 shuffle mask sizes");
41373 assert(llvm::has_single_bit(OpMask.size()) &&
41374 "Non-power-of-2 shuffle mask sizes");
41375 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41376 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41377
41378 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41379 unsigned RootRatio =
41380 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41381 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41382 assert((RootRatio == 1 || OpRatio == 1) &&
41383 "Must not have a ratio for both incoming and op masks!");
41384
41385 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41386 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41387 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41388 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41389 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41390
41391 Mask.resize(MaskWidth, SM_SentinelUndef);
41392
41393 // Merge this shuffle operation's mask into our accumulated mask. Note that
41394 // this shuffle's mask will be the first applied to the input, followed by
41395 // the root mask to get us all the way to the root value arrangement. The
41396 // reason for this order is that we are recursing up the operation chain.
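// For example, merging a v8i16 op mask {1,0,3,2,5,4,7,6} into a v4i32 root
// mask {2,3,0,1} (RootRatio == 2, OpRatio == 1) produces {5,4,7,6,1,0,3,2}:
// the op's word swap is applied first and the root's dword shuffle on top.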
41397 for (unsigned i = 0; i < MaskWidth; ++i) {
41398 unsigned RootIdx = i >> RootRatioLog2;
41399 if (RootMask[RootIdx] < 0) {
41400 // This is a zero or undef lane, we're done.
41401 Mask[i] = RootMask[RootIdx];
41402 continue;
41403 }
41404
41405 unsigned RootMaskedIdx =
41406 RootRatio == 1
41407 ? RootMask[RootIdx]
41408 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41409
41410 // Just insert the scaled root mask value if it references an input other
41411 // than the SrcOp we're currently inserting.
41412 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41413 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41414 Mask[i] = RootMaskedIdx;
41415 continue;
41416 }
41417
41418 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41419 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41420 if (OpMask[OpIdx] < 0) {
41421 // The incoming lanes are zero or undef, it doesn't matter which ones we
41422 // are using.
41423 Mask[i] = OpMask[OpIdx];
41424 continue;
41425 }
41426
41427 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41428 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41429 : (OpMask[OpIdx] << OpRatioLog2) +
41430 (RootMaskedIdx & (OpRatio - 1));
41431
41432 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41433 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41434 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41435 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41436
41437 Mask[i] = OpMaskedIdx;
41438 }
41439 }
41440
41441 // Peek through any free bitcasts to insert_subvector vector widenings or
41442 // extract_subvector nodes back to root size.
41443 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41444 for (auto [I, Op] : enumerate(Ops)) {
41445 SDValue BC = Op;
41446 while (1) {
41447 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41448 BC = BC.getOperand(0);
41449 continue;
41450 }
41451 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41452 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41453 // Set out of bounds mask indices to undef.
41454 Op = BC = BC.getOperand(1);
41455 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41456 int Lo = I * Mask.size();
41457 int Hi = (I + 1) * Mask.size();
41458 int NewHi = Lo + (Mask.size() / Scale);
41459 for (int &M : Mask) {
41460 if (Lo <= M && NewHi <= M && M < Hi)
41461 M = SM_SentinelUndef;
41462 }
41463 continue;
41464 }
41465 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41466 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41467 isNullConstant(BC.getOperand(1))) {
41468 Op = BC = BC.getOperand(0);
41469 continue;
41470 }
41471 break;
41472 }
41473 }
41474
41475 // Remove unused/repeated shuffle source ops.
41476 resolveTargetShuffleInputsAndMask(Ops, Mask);
41477
41478 // Handle the all undef/zero/ones cases early.
41479 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41480 return DAG.getUNDEF(RootVT);
41481 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41482 return getZeroVector(RootVT, Subtarget, DAG, DL);
41483 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41484 !llvm::is_contained(Mask, SM_SentinelZero))
41485 return getOnesVector(RootVT, DAG, DL);
41486
41487 assert(!Ops.empty() && "Shuffle with no inputs detected");
41488
41489 // Update the list of shuffle nodes that have been combined so far.
41490 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41491 CombinedNodes.push_back(Op.getNode());
41492
41493 // See if we can recurse into each shuffle source op (if it's a target
41494 // shuffle). The source op should only be generally combined if it either has
41495 // a single use (i.e. current Op) or all its users have already been combined;
41496 // if not, we can still combine but should prevent generation of variable
41497 // shuffles to avoid constant pool bloat.
41498 // Don't recurse if we already have more source ops than we can combine in
41499 // the remaining recursion depth.
41500 if (Ops.size() < (MaxDepth - Depth)) {
41501 for (int i = 0, e = Ops.size(); i < e; ++i) {
41502 // For empty roots, we need to resolve zeroable elements before combining
41503 // them with other shuffles.
41504 SmallVector<int, 64> ResolvedMask = Mask;
41505 if (EmptyRoot)
41506 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41507 bool AllowCrossLaneVar = false;
41508 bool AllowPerLaneVar = false;
41509 if (Ops[i].getNode()->hasOneUse() ||
41510 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41511 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41512 AllowPerLaneVar = AllowVariablePerLaneMask;
41513 }
41514 if (SDValue Res = combineX86ShufflesRecursively(
41515 Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41516 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41517 DAG, DL, Subtarget))
41518 return Res;
41519 }
41520 }
41521
41522 // Attempt to constant fold all of the constant source ops.
41523 if (SDValue Cst = combineX86ShufflesConstants(
41524 RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41525 return Cst;
41526
41527 // If constant fold failed and we only have constants - then we have
41528 // multiple uses by a single non-variable shuffle - just bail.
41529 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41530 APInt UndefElts;
41531 SmallVector<APInt> RawBits;
41532 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41533 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41534 RawBits,
41535 /*AllowWholeUndefs*/ true,
41536 /*AllowPartialUndefs*/ true);
41537 })) {
41538 return SDValue();
41539 }
41540
41541 // Canonicalize the combined shuffle mask chain with horizontal ops.
41542 // NOTE: This will update the Ops and Mask.
41543 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41544 Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41545 return DAG.getBitcast(RootVT, HOp);
41546
41547 // Try to refine our inputs given our knowledge of target shuffle mask.
41548 for (auto I : enumerate(Ops)) {
41549 int OpIdx = I.index();
41550 SDValue &Op = I.value();
41551
41552 // What range of shuffle mask element values results in picking from Op?
41553 int Lo = OpIdx * Mask.size();
41554 int Hi = Lo + Mask.size();
41555
41556 // Which elements of Op do we demand, given the mask's granularity?
41557 APInt OpDemandedElts(Mask.size(), 0);
41558 for (int MaskElt : Mask) {
41559 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41560 int OpEltIdx = MaskElt - Lo;
41561 OpDemandedElts.setBit(OpEltIdx);
41562 }
41563 }
41564
41565 // Is the shuffle result smaller than the root?
41566 if (Op.getValueSizeInBits() < RootSizeInBits) {
41567 // We padded the mask with undefs. But we now need to undo that.
41568 unsigned NumExpectedVectorElts = Mask.size();
41569 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41570 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41571 assert(!OpDemandedElts.extractBits(
41572 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41573 "Demanding the virtual undef widening padding?");
41574 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41575 }
41576
41577 // The Op itself may be of different VT, so we need to scale the mask.
41578 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41579 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41580
41581 // Can this operand be simplified any further, given its demanded elements?
41582 if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41583 Op, OpScaledDemandedElts, DAG))
41584 Op = NewOp;
41585 }
41586 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41587
41588 // Widen any subvector shuffle inputs we've collected.
41589 // TODO: Remove this to avoid generating temporary nodes, we should only
41590 // widen once combineX86ShuffleChain has found a match.
41591 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41592 return Op.getValueSizeInBits() < RootSizeInBits;
41593 })) {
41594 for (SDValue &Op : Ops)
41595 if (Op.getValueSizeInBits() < RootSizeInBits)
41596 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41597 RootSizeInBits);
41598 // Reresolve - we might have repeated subvector sources.
41599 resolveTargetShuffleInputsAndMask(Ops, Mask);
41600 }
41601
41602 // Handle the all undef/zero/ones cases.
41603 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41604 return DAG.getUNDEF(RootVT);
41605 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41606 return getZeroVector(RootVT, Subtarget, DAG, DL);
41607 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41608 !llvm::is_contained(Mask, SM_SentinelZero))
41609 return getOnesVector(RootVT, DAG, DL);
41610
41611 assert(!Ops.empty() && "Shuffle with no inputs detected");
41612
41613 // We can only combine unary and binary shuffle mask cases.
41614 if (Ops.size() <= 2) {
41615 // Minor canonicalization of the accumulated shuffle mask to make it easier
41616 // to match below. All this does is detect masks with sequential pairs of
41617 // elements, and shrink them to the half-width mask. It does this in a loop
41618 // so it will reduce the size of the mask to the minimal width mask which
41619 // performs an equivalent shuffle.
41620 while (Mask.size() > 1) {
41621 SmallVector<int, 64> WidenedMask;
41622 if (!canWidenShuffleElements(Mask, WidenedMask))
41623 break;
41624 Mask = std::move(WidenedMask);
41625 }
41626
41627 // Canonicalization of binary shuffle masks to improve pattern matching by
41628 // commuting the inputs.
41629 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41630 ShuffleVectorSDNode::commuteMask(Mask);
41631 std::swap(Ops[0], Ops[1]);
41632 }
41633
41634 // Try to combine into a single shuffle instruction.
41635 if (SDValue Shuffle = combineX86ShuffleChain(
41636 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41637 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41638 IsMaskedShuffle, DAG, DL, Subtarget))
41639 return Shuffle;
41640
41641 // If all the operands come from the same larger vector, fallthrough and try
41642 // to use combineX86ShuffleChainWithExtract.
41643 SDValue LHS = peekThroughBitcasts(Ops.front());
41644 SDValue RHS = peekThroughBitcasts(Ops.back());
41645 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41646 (RootSizeInBits / Mask.size()) != 64 ||
41647 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41648 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41649 LHS.getOperand(0) != RHS.getOperand(0))
41650 return SDValue();
41651 }
41652
41653 // If that failed and any input is extracted then try to combine as a
41654 // shuffle with the larger type.
41655 return combineX86ShuffleChainWithExtract(
41656 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41657 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41658 DAG, DL, Subtarget);
41659}
41660
41661/// Helper entry wrapper to combineX86ShufflesRecursively.
41662 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41663 const X86Subtarget &Subtarget) {
41664 return combineX86ShufflesRecursively(
41665 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41666 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41667 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41668 SDLoc(Op), Subtarget);
41669}
41670
41671/// Get the PSHUF-style mask from PSHUF node.
41672///
41673 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41674/// PSHUF-style masks that can be reused with such instructions.
41675 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41676 MVT VT = N.getSimpleValueType();
41677 SmallVector<int, 4> Mask;
41678 SmallVector<SDValue, 2> Ops;
41679 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41680 (void)HaveMask;
41681 assert(HaveMask);
41682
41683 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41684 // matter. Check that the upper masks are repeats and remove them.
41685 if (VT.getSizeInBits() > 128) {
41686 int LaneElts = 128 / VT.getScalarSizeInBits();
41687#ifndef NDEBUG
41688 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41689 for (int j = 0; j < LaneElts; ++j)
41690 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41691 "Mask doesn't repeat in high 128-bit lanes!");
41692#endif
41693 Mask.resize(LaneElts);
41694 }
41695
41696 switch (N.getOpcode()) {
41697 case X86ISD::PSHUFD:
41698 return Mask;
41699 case X86ISD::PSHUFLW:
41700 Mask.resize(4);
41701 return Mask;
41702 case X86ISD::PSHUFHW:
41703 Mask.erase(Mask.begin(), Mask.begin() + 4);
41704 for (int &M : Mask)
41705 M -= 4;
41706 return Mask;
41707 default:
41708 llvm_unreachable("No valid shuffle instruction found!");
41709 }
41710}
41711
41712/// Get the expanded blend mask from a BLENDI node.
41713/// For v16i16 nodes, this will splat the repeated i8 mask.
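/// E.g. a v16i16 blend immediate of 0xB1 expands to the 16-bit mask 0xB1B1,
/// since the 8-bit immediate is applied to each 128-bit lane.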
41714 static APInt getBLENDIBlendMask(SDValue V) {
41715 assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41716 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41717 APInt Mask = V.getConstantOperandAPInt(2);
41718 if (Mask.getBitWidth() > NumElts)
41719 Mask = Mask.trunc(NumElts);
41720 if (NumElts == 16) {
41721 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41722 Mask = APInt::getSplat(16, Mask);
41723 }
41724 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41725 return Mask;
41726}
41727
41728/// Search for a combinable shuffle across a chain ending in pshufd.
41729///
41730/// We walk up the chain and look for a combinable shuffle, skipping over
41731/// shuffles that we could hoist this shuffle's transformation past without
41732/// altering anything.
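/// For example, a single-use PSHUFD(PSHUFD(x,a),b) chain folds into one PSHUFD
/// with the composed mask, and a dword shuffle that leaves the low dwords in
/// place can be hoisted past a PSHUFLW that only touches the low words.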
41733 static SDValue combineRedundantDWordShuffle(SDValue N,
41734 MutableArrayRef<int> Mask,
41735 const SDLoc &DL,
41736 SelectionDAG &DAG) {
41737 assert(N.getOpcode() == X86ISD::PSHUFD &&
41738 "Called with something other than an x86 128-bit half shuffle!");
41739
41740 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41741 // of the shuffles in the chain so that we can form a fresh chain to replace
41742 // this one.
41743 SmallVector<SDValue, 8> Chain;
41744 SDValue V = N.getOperand(0);
41745 for (; V.hasOneUse(); V = V.getOperand(0)) {
41746 switch (V.getOpcode()) {
41747 default:
41748 return SDValue(); // Nothing combined!
41749
41750 case ISD::BITCAST:
41751 // Skip bitcasts as we always know the type for the target specific
41752 // instructions.
41753 continue;
41754
41755 case X86ISD::PSHUFD:
41756 // Found another dword shuffle.
41757 break;
41758
41759 case X86ISD::PSHUFLW:
41760 // Check that the low words (being shuffled) are the identity in the
41761 // dword shuffle, and the high words are self-contained.
41762 if (Mask[0] != 0 || Mask[1] != 1 ||
41763 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41764 return SDValue();
41765
41766 Chain.push_back(V);
41767 continue;
41768
41769 case X86ISD::PSHUFHW:
41770 // Check that the high words (being shuffled) are the identity in the
41771 // dword shuffle, and the low words are self-contained.
41772 if (Mask[2] != 2 || Mask[3] != 3 ||
41773 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41774 return SDValue();
41775
41776 Chain.push_back(V);
41777 continue;
41778
41779 case X86ISD::UNPCKL:
41780 case X86ISD::UNPCKH:
41781 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41782 // shuffle into a preceding word shuffle.
41783 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41784 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41785 return SDValue();
41786
41787 // Search for a half-shuffle which we can combine with.
41788 unsigned CombineOp =
41789 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41790 if (V.getOperand(0) != V.getOperand(1) ||
41791 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41792 return SDValue();
41793 Chain.push_back(V);
41794 V = V.getOperand(0);
41795 do {
41796 switch (V.getOpcode()) {
41797 default:
41798 return SDValue(); // Nothing to combine.
41799
41800 case X86ISD::PSHUFLW:
41801 case X86ISD::PSHUFHW:
41802 if (V.getOpcode() == CombineOp)
41803 break;
41804
41805 Chain.push_back(V);
41806
41807 [[fallthrough]];
41808 case ISD::BITCAST:
41809 V = V.getOperand(0);
41810 continue;
41811 }
41812 break;
41813 } while (V.hasOneUse());
41814 break;
41815 }
41816 // Break out of the loop if we break out of the switch.
41817 break;
41818 }
41819
41820 if (!V.hasOneUse())
41821 // We fell out of the loop without finding a viable combining instruction.
41822 return SDValue();
41823
41824 // Merge this node's mask and our incoming mask.
41825 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41826 for (int &M : Mask)
41827 M = VMask[M];
41828 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41829 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41830
41831 // Rebuild the chain around this new shuffle.
41832 while (!Chain.empty()) {
41833 SDValue W = Chain.pop_back_val();
41834
41835 if (V.getValueType() != W.getOperand(0).getValueType())
41836 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41837
41838 switch (W.getOpcode()) {
41839 default:
41840 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41841
41842 case X86ISD::UNPCKL:
41843 case X86ISD::UNPCKH:
41844 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41845 break;
41846
41847 case X86ISD::PSHUFD:
41848 case X86ISD::PSHUFLW:
41849 case X86ISD::PSHUFHW:
41850 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41851 break;
41852 }
41853 }
41854 if (V.getValueType() != N.getValueType())
41855 V = DAG.getBitcast(N.getValueType(), V);
41856
41857 // Return the new chain to replace N.
41858 return V;
41859}
41860
41861// Attempt to commute shufps LHS loads:
41862// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
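// Commuting a SHUFPS swaps which 64-bit half of the result comes from which
// source, so the two nibbles of its immediate are swapped and the consuming
// shuffle's immediate is fixed up below: XOR 0xAA flips every 2-bit index
// between the two 64-bit halves of its 128-bit lane, while 0x0A/0xA0 do the
// same for just the indices that read the commuted operand.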
41863 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41864 SelectionDAG &DAG) {
41865 // TODO: Add vXf64 support.
41866 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41867 return SDValue();
41868
41869 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41870 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41871 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41872 return SDValue();
41873 SDValue N0 = V.getOperand(0);
41874 SDValue N1 = V.getOperand(1);
41875 unsigned Imm = V.getConstantOperandVal(2);
41876 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41877 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41878 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41879 return SDValue();
41880 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41881 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41882 DAG.getTargetConstant(Imm, DL, MVT::i8));
41883 };
41884
41885 switch (N.getOpcode()) {
41886 case X86ISD::VPERMILPI:
41887 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41888 unsigned Imm = N.getConstantOperandVal(1);
41889 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41890 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41891 }
41892 break;
41893 case X86ISD::SHUFP: {
41894 SDValue N0 = N.getOperand(0);
41895 SDValue N1 = N.getOperand(1);
41896 unsigned Imm = N.getConstantOperandVal(2);
41897 if (N0 == N1) {
41898 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41899 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41900 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41901 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41902 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41903 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41904 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41905 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41906 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41907 }
41908 break;
41909 }
41910 }
41911
41912 return SDValue();
41913}
41914
41915// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41916// iff we don't demand the same element index for both X and Y.
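// For example (v4i32): blend({0,1,6,7}) of permute(X,{2,3,u,u}) and
// permute(Y,{u,u,0,1}) demands X[2],X[3] and Y[0],Y[1], which don't collide,
// so it can be rewritten as permute(blend(X,Y,{4,5,2,3}),{2,3,0,1}).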
41917static SDValue
41918 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41919 const APInt &DemandedElts, SelectionDAG &DAG,
41920 const X86Subtarget &Subtarget, const SDLoc &DL) {
41921 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41922 if (!N0.hasOneUse() || !N1.hasOneUse())
41923 return SDValue();
41924
41925 unsigned NumElts = VT.getVectorNumElements();
41926 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41927 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41928
41929 // See if both operands are shuffles, and that we can scale the shuffle masks
41930 // to the same width as the blend mask.
41931 // TODO: Support SM_SentinelZero?
41932 SmallVector<SDValue, 2> Ops0, Ops1;
41933 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41934 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41935 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41936 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41937 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41938 return SDValue();
41939
41940 // Determine the demanded elts from both permutes.
41941 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41942 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41943 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41944 Demanded1,
41945 /*AllowUndefElts=*/true) ||
41946 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41947 DemandedRHS0, /*AllowUndefElts=*/true) ||
41948 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41949 DemandedRHS1, /*AllowUndefElts=*/true))
41950 return SDValue();
41951
41952 // Confirm that we only use a single operand from both permutes and that we
41953 // don't demand the same index from both.
41954 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41955 DemandedLHS0.intersects(DemandedLHS1))
41956 return SDValue();
41957
41958 // Use the permute demanded elts masks as the new blend mask.
41959 // Create the new permute mask as a blend of the 2 original permute masks.
41960 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41961 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41962 for (unsigned I = 0; I != NumElts; ++I) {
41963 if (Demanded0[I]) {
41964 int M = ScaledMask0[I];
41965 if (0 <= M) {
41966 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41967 "BlendMask demands LHS AND RHS");
41968 NewBlendMask[M] = M;
41969 NewPermuteMask[I] = M;
41970 }
41971 } else if (Demanded1[I]) {
41972 int M = ScaledMask1[I];
41973 if (0 <= M) {
41974 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41975 "BlendMask demands LHS AND RHS");
41976 NewBlendMask[M] = M + NumElts;
41977 NewPermuteMask[I] = M;
41978 }
41979 }
41980 }
41981 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41982 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41983
41984 // v16i16 shuffles can explode in complexity very easily, only accept them if
41985 // the blend mask is the same in the 128-bit subvectors (or can widen to
41986 // v8i32) and the permute can be widened as well.
41987 if (VT == MVT::v16i16) {
41988 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41989 !canWidenShuffleElements(NewBlendMask))
41990 return SDValue();
41991 if (!canWidenShuffleElements(NewPermuteMask))
41992 return SDValue();
41993 }
41994
41995 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41996 // widened to a lane permute (vperm2f128).
41997 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41998 is128BitLaneCrossingShuffleMask(VT,
41999 NewPermuteMask) &&
42000 !canScaleShuffleElements(NewPermuteMask, 2))
42001 return SDValue();
42002
42003 SDValue NewBlend =
42004 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
42005 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
42006 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
42007 NewPermuteMask);
42008}
42009
42010// TODO - move this to TLI like isBinOp?
42011static bool isUnaryOp(unsigned Opcode) {
42012 switch (Opcode) {
42013 case ISD::CTLZ:
42014 case ISD::CTTZ:
42015 case ISD::CTPOP:
42016 return true;
42017 }
42018 return false;
42019}
42020
42021// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
42022// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
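// This only pays off when the new shuffles are cheap, e.g.
// PSHUFD(AND(X,C)) -> AND(PSHUFD(X),PSHUFD(C)) where C is a constant or
// splat, so the shuffle of C folds away and the remaining shuffle of X can
// combine further.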
42023 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
42024 const SDLoc &DL) {
42025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42026 EVT ShuffleVT = N.getValueType();
42027 unsigned Opc = N.getOpcode();
42028
42029 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
42030 // AllZeros/AllOnes constants are freely shuffled and will peek through
42031 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42032 // merge with target shuffles if it has one use so shuffle combining is
42033 // likely to kick in. Shuffles of splats are expected to be removed.
42034 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42035 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42036 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42037 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42038 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42039 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42040 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42041 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42042 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42043 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42044 };
42045 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42046 // Ensure we only shuffle whole vector src elements, unless it's a logical
42047 // binop where we can more aggressively move shuffles from dst to src.
42048 return isLogicOp(BinOp) ||
42049 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42050 };
42051
42052 switch (Opc) {
42053 // Unary and Unary+Permute Shuffles.
42054 case X86ISD::PSHUFB: {
42055 // Don't merge PSHUFB if it contains zero'd elements.
42056 SmallVector<int> Mask;
42057 SmallVector<SDValue> Ops;
42058 if (!getTargetShuffleMask(N, false, Ops, Mask))
42059 break;
42060 [[fallthrough]];
42061 }
42062 case X86ISD::VBROADCAST:
42063 case X86ISD::MOVDDUP:
42064 case X86ISD::PSHUFD:
42065 case X86ISD::PSHUFHW:
42066 case X86ISD::PSHUFLW:
42067 case X86ISD::VPERMV:
42068 case X86ISD::VPERMI:
42069 case X86ISD::VPERMILPI: {
42070 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42071 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42072 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42073 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42074 unsigned SrcOpcode = N0.getOpcode();
42075 EVT OpVT = N0.getValueType();
42076 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42077 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42078 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42079 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42080 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42081 IsMergeableWithShuffle(Op01, FoldShuf)) {
42082 SDValue LHS, RHS;
42083 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42084 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42085 if (Opc == X86ISD::VPERMV) {
42086 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42087 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42088 } else if (N.getNumOperands() == 2) {
42089 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42090 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42091 } else {
42092 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42093 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42094 }
42095 return DAG.getBitcast(ShuffleVT,
42096 DAG.getNode(SrcOpcode, DL, OpVT,
42097 DAG.getBitcast(OpVT, LHS),
42098 DAG.getBitcast(OpVT, RHS)));
42099 }
42100 }
42101 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42102 OpVT.getScalarSizeInBits() ==
42103 N0.getOperand(0).getScalarValueSizeInBits()) {
42104 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42105 if (Opc == X86ISD::VPERMV)
42106 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42107 else if (N.getNumOperands() == 2)
42108 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42109 else
42110 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42111 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42112 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42113 }
42114 }
42115 break;
42116 }
42117 // Binary and Binary+Permute Shuffles.
42118 case X86ISD::INSERTPS: {
42119 // Don't merge INSERTPS if it contains zero'd elements.
42120 unsigned InsertPSMask = N.getConstantOperandVal(2);
42121 unsigned ZeroMask = InsertPSMask & 0xF;
42122 if (ZeroMask != 0)
42123 break;
42124 [[fallthrough]];
42125 }
42126 case X86ISD::MOVSD:
42127 case X86ISD::MOVSS:
42128 case X86ISD::BLENDI:
42129 case X86ISD::SHUFP:
42130 case X86ISD::UNPCKH:
42131 case X86ISD::UNPCKL: {
42132 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42133 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42134 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42135 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42136 unsigned SrcOpcode = N0.getOpcode();
42137 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42138 N0.getValueType() == N1.getValueType() &&
42139 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42140 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42141 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42142 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42143 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42144 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42145 // Ensure the total number of shuffles doesn't increase by folding this
42146 // shuffle through to the source ops.
42147 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42148 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42149 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42150 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42151 SDValue LHS, RHS;
42152 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42153 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42154 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42155 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42156 if (N.getNumOperands() == 3) {
42157 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42158 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42159 } else {
42160 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42161 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42162 }
42163 EVT OpVT = N0.getValueType();
42164 return DAG.getBitcast(ShuffleVT,
42165 DAG.getNode(SrcOpcode, DL, OpVT,
42166 DAG.getBitcast(OpVT, LHS),
42167 DAG.getBitcast(OpVT, RHS)));
42168 }
42169 }
42170 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42171 N0.getValueType() == N1.getValueType() &&
42172 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42173 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42174 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42175 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42176 SDValue Res;
42177 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42178 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42179 if (N.getNumOperands() == 3) {
42180 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42181 } else {
42182 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42183 }
42184 EVT OpVT = N0.getValueType();
42185 return DAG.getBitcast(
42186 ShuffleVT,
42187 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42188 }
42189 // TODO: We can generalize this for other shuffles/conversions.
42190 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42191 N1.getOpcode() == SrcOpcode &&
42192 N0.getValueType() == N1.getValueType() &&
42193 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42194 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42195 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42196 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42197 EVT OpSrcVT = N0.getOperand(0).getValueType();
42198 EVT OpDstVT = N0.getValueType();
42199 SDValue Res =
42200 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42201 return DAG.getBitcast(ShuffleVT,
42202 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42203 }
42204 }
42205 break;
42206 }
42207 }
42208 return SDValue();
42209}
42210
42211/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42212 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42213 SelectionDAG &DAG,
42214 const SDLoc &DL) {
42215 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42216
42217 MVT VT = V.getSimpleValueType();
42218 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42219 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42220 unsigned SrcOpc0 = Src0.getOpcode();
42221 unsigned SrcOpc1 = Src1.getOpcode();
42222 EVT SrcVT0 = Src0.getValueType();
42223 EVT SrcVT1 = Src1.getValueType();
42224
42225 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42226 return SDValue();
42227
42228 switch (SrcOpc0) {
42229 case X86ISD::MOVDDUP: {
42230 SDValue LHS = Src0.getOperand(0);
42231 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42232 SDValue Res =
42233 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42234 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42235 return DAG.getBitcast(VT, Res);
42236 }
42237 case X86ISD::VPERMILPI:
42238 // TODO: Handle v4f64 permutes with different low/high lane masks.
42239 if (SrcVT0 == MVT::v4f64) {
42240 uint64_t Mask = Src0.getConstantOperandVal(1);
42241 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42242 break;
42243 }
42244 [[fallthrough]];
42245 case X86ISD::VSHLI:
42246 case X86ISD::VSRLI:
42247 case X86ISD::VSRAI:
42248 case X86ISD::PSHUFD:
42249 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42250 SDValue LHS = Src0.getOperand(0);
42251 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42252 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42253 V.getOperand(2));
42254 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42255 return DAG.getBitcast(VT, Res);
42256 }
42257 break;
42258 }
42259
42260 return SDValue();
42261}
42262
42263/// Try to combine x86 target specific shuffles.
42264 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42265 SelectionDAG &DAG,
42266 TargetLowering::DAGCombinerInfo &DCI,
42267 const X86Subtarget &Subtarget) {
42268 using namespace SDPatternMatch;
42269
42270 MVT VT = N.getSimpleValueType();
42271 unsigned NumElts = VT.getVectorNumElements();
42272 SmallVector<int, 4> Mask;
42273 unsigned Opcode = N.getOpcode();
42274 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42275
42276 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42277 return R;
42278
42279 // Handle specific target shuffles.
42280 switch (Opcode) {
42281 case X86ISD::MOVDDUP: {
42282 SDValue Src = N.getOperand(0);
42283 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42284 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42285 ISD::isNormalLoad(Src.getNode())) {
42286 LoadSDNode *LN = cast<LoadSDNode>(Src);
42287 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42288 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42289 DCI.CombineTo(N.getNode(), Movddup);
42290 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42291 DCI.recursivelyDeleteUnusedNodes(LN);
42292 return N; // Return N so it doesn't get rechecked!
42293 }
42294 }
42295
42296 return SDValue();
42297 }
42298 case X86ISD::VBROADCAST: {
42299 SDValue Src = N.getOperand(0);
42300 SDValue BC = peekThroughBitcasts(Src);
42301 EVT SrcVT = Src.getValueType();
42302 EVT BCVT = BC.getValueType();
42303
42304 // If broadcasting from another shuffle, attempt to simplify it.
42305 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42306 if (isTargetShuffle(BC.getOpcode()) &&
42307 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42308 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42309 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42310 SM_SentinelUndef);
42311 for (unsigned i = 0; i != Scale; ++i)
42312 DemandedMask[i] = i;
42313 if (SDValue Res = combineX86ShufflesRecursively(
42314 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42315 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42316 /*AllowVariableCrossLaneMask=*/true,
42317 /*AllowVariablePerLaneMask=*/true,
42318 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42319 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42320 DAG.getBitcast(SrcVT, Res));
42321 }
42322
42323 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42324 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42325 if (Src.getOpcode() == ISD::BITCAST &&
42326 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42327 TLI.isTypeLegal(BCVT) &&
42328 FixedVectorType::isValidElementType(
42329 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42330 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42331 VT.getVectorNumElements());
42332 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42333 }
42334
42335 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42336 // If we're re-broadcasting a smaller type then broadcast with that type and
42337 // bitcast.
42338 // TODO: Do this for any splat?
42339 if (Src.getOpcode() == ISD::BITCAST &&
42340 (BC.getOpcode() == X86ISD::VBROADCAST ||
42341 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42342 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42343 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42344 MVT NewVT =
42345 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42346 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42347 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42348 }
42349
42350 // Reduce broadcast source vector to lowest 128-bits.
42351 if (SrcVT.getSizeInBits() > 128)
42352 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42353 extract128BitVector(Src, 0, DAG, DL));
42354
42355 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42356 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42357 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42358 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42359
42360 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42361 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42362 isNullConstant(Src.getOperand(1)) &&
42363 Src.getValueType() ==
42364 Src.getOperand(0).getValueType().getScalarType() &&
42365 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42366 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42367
42368 // Share broadcast with the longest vector and extract low subvector (free).
42369 // Ensure the same SDValue from the SDNode use is being used.
42370 for (SDNode *User : Src->users())
42371 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42372 Src == User->getOperand(0) &&
42373 User->getValueSizeInBits(0).getFixedValue() >
42374 VT.getFixedSizeInBits()) {
42375 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42376 VT.getSizeInBits());
42377 }
42378
42379 // vbroadcast(scalarload X) -> vbroadcast_load X
42380 // For float loads, extract other uses of the scalar from the broadcast.
42381 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42382 ISD::isNormalLoad(Src.getNode())) {
42383 LoadSDNode *LN = cast<LoadSDNode>(Src);
42384 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42385 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42386 SDValue BcastLd =
42387 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42388 LN->getMemoryVT(), LN->getMemOperand());
42389 // If the load value is used only by N, replace it via CombineTo N.
42390 bool NoReplaceExtract = Src.hasOneUse();
42391 DCI.CombineTo(N.getNode(), BcastLd);
42392 if (NoReplaceExtract) {
42393 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42394 DCI.recursivelyDeleteUnusedNodes(LN);
42395 } else {
42396 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42397 DAG.getVectorIdxConstant(0, DL));
42398 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42399 }
42400 return N; // Return N so it doesn't get rechecked!
42401 }
42402
42403 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42404 // i16. So shrink it ourselves if we can make a broadcast_load.
42405 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42406 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42407 assert(Subtarget.hasAVX2() && "Expected AVX2");
42408 SDValue TruncIn = Src.getOperand(0);
42409
42410 // If this is a truncate of a non-extending load we can just narrow it to
42411 // use a broadcast_load.
42412 if (ISD::isNormalLoad(TruncIn.getNode())) {
42413 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42414 // Unless it's volatile or atomic.
42415 if (LN->isSimple()) {
42416 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42417 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42418 SDValue BcastLd = DAG.getMemIntrinsicNode(
42419 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42420 LN->getPointerInfo(), LN->getBaseAlign(),
42421 LN->getMemOperand()->getFlags());
42422 DCI.CombineTo(N.getNode(), BcastLd);
42423 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42424 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42425 return N; // Return N so it doesn't get rechecked!
42426 }
42427 }
42428
42429 // If this is a truncate of an i16 extload, we can directly replace it.
42430 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42431 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42432 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42433 if (LN->getMemoryVT().getSizeInBits() == 16) {
42434 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42435 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42436 SDValue BcastLd =
42437 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42438 LN->getMemoryVT(), LN->getMemOperand());
42439 DCI.CombineTo(N.getNode(), BcastLd);
42440 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42441 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42442 return N; // Return N so it doesn't get rechecked!
42443 }
42444 }
42445
42446 // If this is a truncate of a load that has been shifted right, we can
42447 // offset the pointer and use a narrower load.
42448 if (TruncIn.getOpcode() == ISD::SRL &&
42449 TruncIn.getOperand(0).hasOneUse() &&
42450 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42451 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42452 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42453 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42454 // Make sure the shift amount and the load size are divisible by 16.
42455 // Don't do this if the load is volatile or atomic.
42456 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42457 LN->isSimple()) {
42458 unsigned Offset = ShiftAmt / 8;
42459 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42460 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42461 TypeSize::getFixed(Offset), DL);
42462 SDValue Ops[] = { LN->getChain(), Ptr };
42463 SDValue BcastLd = DAG.getMemIntrinsicNode(
42464 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42465 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42466 LN->getMemOperand()->getFlags());
42467 DCI.CombineTo(N.getNode(), BcastLd);
42468 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42469 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42470 return N; // Return N so it doesn't get rechecked!
42471 }
42472 }
42473 }
42474
42475 // vbroadcast(vzload X) -> vbroadcast_load X
42476 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42477 auto *LN = cast<MemSDNode>(Src);
42478 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42479 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42480 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42481 SDValue BcastLd =
42482 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42483 LN->getMemoryVT(), LN->getMemOperand());
42484 DCI.CombineTo(N.getNode(), BcastLd);
42485 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42486 DCI.recursivelyDeleteUnusedNodes(LN);
42487 return N; // Return N so it doesn't get rechecked!
42488 }
42489 }
42490
42491 // vbroadcast(vector load X) -> vbroadcast_load
42492 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42493 LoadSDNode *LN = cast<LoadSDNode>(Src);
42494 // Unless the load is volatile or atomic.
42495 if (LN->isSimple()) {
42496 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42497 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42498 SDValue BcastLd = DAG.getMemIntrinsicNode(
42499 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, VT.getScalarType(),
42500 LN->getPointerInfo(), LN->getBaseAlign(),
42501 LN->getMemOperand()->getFlags());
42502 DCI.CombineTo(N.getNode(), BcastLd);
42503 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42504 DCI.recursivelyDeleteUnusedNodes(LN);
42505 return N; // Return N so it doesn't get rechecked!
42506 }
42507 }
42508
42509 return SDValue();
42510 }
42511 case X86ISD::VZEXT_MOVL: {
42512 SDValue N0 = N.getOperand(0);
42513
42514 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42515 // Zeroing out the upper elements means we're just shifting a zero value.
42516 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42517 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42518 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42519 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42520 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42521 if (N0.hasOneUse())
42522 return DAG.getNode(
42523 N0.getOpcode(), DL, VT,
42524 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42525 N0.getOperand(1));
42526 }
42527
42528 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42529 // the load is volatile.
42530 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42531 auto *LN = cast<LoadSDNode>(N0);
42532 if (SDValue VZLoad =
42533 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42534 DCI.CombineTo(N.getNode(), VZLoad);
42535 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42536 DCI.recursivelyDeleteUnusedNodes(LN);
42537 return N;
42538 }
42539 }
42540
42541 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42542 // and can just use a VZEXT_LOAD.
42543 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42544 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42545 auto *LN = cast<MemSDNode>(N0);
42546 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42547 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42548 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42549 SDValue VZLoad =
42550 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42551 LN->getMemoryVT(), LN->getMemOperand());
42552 DCI.CombineTo(N.getNode(), VZLoad);
42553 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42554 DCI.recursivelyDeleteUnusedNodes(LN);
42555 return N;
42556 }
42557 }
42558
42559 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42560 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42561 // if the upper bits of the i64 are zero.
42562 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42563 N0.getOperand(0).hasOneUse() &&
42564 N0.getOperand(0).getValueType() == MVT::i64) {
42565 SDValue In = N0.getOperand(0);
42566 APInt Mask = APInt::getHighBitsSet(64, 32);
42567 if (DAG.MaskedValueIsZero(In, Mask)) {
42568 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42569 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42570 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42571 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42572 return DAG.getBitcast(VT, Movl);
42573 }
42574 }
42575
42576 // Load a scalar integer constant directly to XMM instead of transferring an
42577 // immediate value from GPR.
42578 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42579 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42580 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42581 // Create a vector constant - scalar constant followed by zeros.
42582 EVT ScalarVT = N0.getOperand(0).getValueType();
42583 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42584 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42585 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42586 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42587
42588 // Load the vector constant from constant pool.
42589 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42590 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42591 MachinePointerInfo MPI =
42592 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42593 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42594 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42595 MachineMemOperand::MOLoad);
42596 }
42597 }
42598
42599 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42600 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42601 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42602 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42603 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42604 SDValue V = peekThroughOneUseBitcasts(N0);
42605
42606 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42607 isNullConstant(V.getOperand(2))) {
42608 SDValue In = V.getOperand(1);
42609 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42610 In.getValueSizeInBits() /
42611 VT.getScalarSizeInBits());
42612 In = DAG.getBitcast(SubVT, In);
42613 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42614 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42615 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42616 V.getOperand(2));
42617 }
42618 }
42619
42620 return SDValue();
42621 }
42622 case X86ISD::BLENDI: {
42623 SDValue N0 = N.getOperand(0);
42624 SDValue N1 = N.getOperand(1);
42625 unsigned EltBits = VT.getScalarSizeInBits();
42626
42627 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42628 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42629 // TODO: Handle MVT::v16i16 repeated blend mask.
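// The blend mask is rescaled to the narrower element count, e.g. a v4i64
// BLENDI with mask 0b0101 over bitcast v8i32 inputs becomes a v8i32 BLENDI
// with mask 0b00110011.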
42630 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42631 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42632 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42633 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42634 unsigned NewSize = SrcVT.getVectorNumElements();
42635 APInt BlendMask = getBLENDIBlendMask(N);
42636 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42637 return DAG.getBitcast(
42638 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42639 N1.getOperand(0),
42640 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42641 DL, MVT::i8)));
42642 }
42643 }
42644 // Share PSHUFB masks:
42645 // blend(pshufb(x,m1),pshufb(y,m2))
42646 // --> m3 = blend(m1,m2)
42647 // blend(pshufb(x,m3),pshufb(y,m3))
42648 if (N0.hasOneUse() && N1.hasOneUse()) {
42649 SmallVector<int> Mask, ByteMask;
42650 SmallVector<SDValue, 2> Ops;
42651 SDValue LHS = peekThroughOneUseBitcasts(N0);
42652 SDValue RHS = peekThroughOneUseBitcasts(N1);
42653 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42654 RHS.getOpcode() == X86ISD::PSHUFB &&
42655 LHS.getOperand(1) != RHS.getOperand(1) &&
42656 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42657 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42658 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42660 "BLENDI decode mismatch");
42661 MVT ShufVT = LHS.getSimpleValueType();
42662 SDValue MaskLHS = LHS.getOperand(1);
42663 SDValue MaskRHS = RHS.getOperand(1);
42664 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42665 if (SDValue NewMask = combineX86ShufflesConstants(
42666 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42667 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42668 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42669 LHS.getOperand(0), NewMask);
42670 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42671 RHS.getOperand(0), NewMask);
42672 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42673 DAG.getBitcast(VT, NewLHS),
42674 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42675 }
42676 }
42677 }
42678 }
42679 return SDValue();
42680 }
42681 case X86ISD::SHUFP: {
42682 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42683 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42684 // TODO: Support types other than v4f32.
42685 if (VT == MVT::v4f32) {
42686 bool Updated = false;
42687 SmallVector<int> Mask;
42688 SmallVector<SDValue> Ops;
42689 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42690 for (int i = 0; i != 2; ++i) {
42691 SmallVector<SDValue> SubOps;
42692 SmallVector<int> SubMask, SubScaledMask;
42693 SDValue Sub = peekThroughBitcasts(Ops[i]);
42694 // TODO: Scaling might be easier if we specify the demanded elts.
42695 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42696 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42697 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42698 int Ofs = i * 2;
42699 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42700 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42701 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42702 Updated = true;
42703 }
42704 }
42705 }
42706 if (Updated) {
42707 for (int &M : Mask)
42708 M %= 4;
42709 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42710 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42711 }
42712 }
42713 return SDValue();
42714 }
42715 case X86ISD::VPERMI: {
42716 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42717 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42718 SDValue N0 = N.getOperand(0);
42719 SDValue N1 = N.getOperand(1);
42720 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42721 if (N0.getOpcode() == ISD::BITCAST &&
42722 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42723 SDValue Src = N0.getOperand(0);
42724 EVT SrcVT = Src.getValueType();
42725 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42726 return DAG.getBitcast(VT, Res);
42727 }
42728 return SDValue();
42729 }
42730 case X86ISD::SHUF128: {
42731 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42732 // see if we can peek through and access the subvector directly.
42733 if (VT.is512BitVector()) {
42734 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42735 // the upper subvector is used.
42736 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42737 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42738 uint64_t Mask = N->getConstantOperandVal(2);
42739 SmallVector<SDValue> LHSOps, RHSOps;
42740 SDValue NewLHS, NewRHS;
42741 if ((Mask & 0x0A) == 0x0A &&
42742 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42743 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42744 Mask &= ~0x0A;
42745 }
42746 if ((Mask & 0xA0) == 0xA0 &&
42747 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42748 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42749 Mask &= ~0xA0;
42750 }
42751 if (NewLHS || NewRHS)
42752 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42753 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42754 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42755 DAG.getTargetConstant(Mask, DL, MVT::i8));
42756 }
42757 return SDValue();
42758 }
42759 case X86ISD::VPERM2X128: {
42760 SDValue LHS = N->getOperand(0);
42761 SDValue RHS = N->getOperand(1);
42762 unsigned Imm = N.getConstantOperandVal(2) & 255;
42763
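// Each nibble of the VPERM2X128 immediate selects a 128-bit half of the
// result: values 0/1 pick the low/high half of LHS, 2/3 the low/high half of
// RHS, and bit 3 of the nibble zeroes that half. XOR'ing both selectors with
// 2 (0x22 overall) therefore retargets them at the other source operand.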
42764 // Canonicalize unary/repeated operands to LHS.
42765 if (LHS.isUndef() && !RHS.isUndef())
42766 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42767 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42768 if (LHS == RHS)
42769 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42770 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42771
42772 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42773 if (LHS.getOpcode() == ISD::BITCAST &&
42774 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42775 EVT SrcVT = LHS.getOperand(0).getValueType();
42776 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42777 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42778 DAG.getBitcast(SrcVT, LHS),
42779 DAG.getBitcast(SrcVT, RHS),
42780 N->getOperand(2)));
42781 }
42782 }
42783
42784 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42785 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42786 return Res;
42787
42788 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42789 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
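// E.g. with immediate 0x31 the low half comes from selector 1 (Y, the high
// half of the first concat) and the high half from selector 3 (W), so the
// whole node is simply concat(Y,W).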
42790 auto FindSubVector128 = [&](unsigned Idx) {
42791 if (Idx > 3)
42792 return SDValue();
42793 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42794 SmallVector<SDValue> SubOps;
42795 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42796 return SubOps[Idx & 1];
42797 unsigned NumElts = Src.getValueType().getVectorNumElements();
42798 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42799 Src.getOperand(1).getValueSizeInBits() == 128 &&
42800 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42801 return Src.getOperand(1);
42802 }
42803 return SDValue();
42804 };
42805 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42806 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42807 MVT SubVT = VT.getHalfNumVectorElementsVT();
42808 SubLo = DAG.getBitcast(SubVT, SubLo);
42809 SubHi = DAG.getBitcast(SubVT, SubHi);
42810 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42811 }
42812 }
42813
42814 // Attempt to match VBROADCAST*128 subvector broadcast load.
42815 if (RHS.isUndef()) {
42816 SmallVector<int, 4> Mask;
42817 DecodeVPERM2X128Mask(4, Imm, Mask);
42818 if (isUndefOrInRange(Mask, 0, 4)) {
42819 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42820 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42821 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42822 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42823 MVT MemVT = VT.getHalfNumVectorElementsVT();
42824 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42825 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42826 cast<LoadSDNode>(LHS), Ofs, DAG);
42827 }
42828 }
42829 }
42830
42831 return SDValue();
42832 }
42833 case X86ISD::PSHUFD:
42834 case X86ISD::PSHUFLW:
42835 case X86ISD::PSHUFHW: {
42836 SDValue N0 = N.getOperand(0);
42837 SDValue N1 = N.getOperand(1);
42838 if (N0->hasOneUse()) {
42839 SDValue V = peekThroughOneUseBitcasts(N0);
42840 switch (V.getOpcode()) {
42841 case X86ISD::VSHL:
42842 case X86ISD::VSRL:
42843 case X86ISD::VSRA:
42844 case X86ISD::VSHLI:
42845 case X86ISD::VSRLI:
42846 case X86ISD::VSRAI:
42847 case X86ISD::VROTLI:
42848 case X86ISD::VROTRI: {
42849 MVT InnerVT = V.getSimpleValueType();
42850 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42851 SDValue Res = DAG.getNode(Opcode, DL, VT,
42852 DAG.getBitcast(VT, V.getOperand(0)), N1);
42853 Res = DAG.getBitcast(InnerVT, Res);
42854 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42855 return DAG.getBitcast(VT, Res);
42856 }
42857 break;
42858 }
42859 }
42860 }
42861
42862 Mask = getPSHUFShuffleMask(N);
42863 assert(Mask.size() == 4);
42864 break;
42865 }
42866 case X86ISD::MOVSD:
42867 case X86ISD::MOVSH:
42868 case X86ISD::MOVSS: {
42869 SDValue N0 = N.getOperand(0);
42870 SDValue N1 = N.getOperand(1);
42871
42872 // Canonicalize scalar FPOps:
42873 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42874 // If commutable, allow OP(N1[0], N0[0]).
42875 unsigned Opcode1 = N1.getOpcode();
42876 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42877 Opcode1 == ISD::FDIV) {
42878 SDValue N10 = N1.getOperand(0);
42879 SDValue N11 = N1.getOperand(1);
42880 if (N10 == N0 ||
42881 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42882 if (N10 != N0)
42883 std::swap(N10, N11);
42884 MVT SVT = VT.getVectorElementType();
42885 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42886 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42887 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42888 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42889 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42890 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42891 }
42892 }
42893
42894 return SDValue();
42895 }
42896 case X86ISD::INSERTPS: {
42897 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42898 SDValue Op0 = N.getOperand(0);
42899 SDValue Op1 = N.getOperand(1);
42900 unsigned InsertPSMask = N.getConstantOperandVal(2);
42901 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42902 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42903 unsigned ZeroMask = InsertPSMask & 0xF;
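// The INSERTPS immediate packs CountS (bits 7:6, the source element), CountD
// (bits 5:4, the destination slot) and ZMask (bits 3:0, lanes to zero), as
// decoded above.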
42904
42905 // If we zero out all elements from Op0 then we don't need to reference it.
42906 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42907 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42908 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42909
42910 // If we zero out the element from Op1 then we don't need to reference it.
42911 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42912 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42913 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42914
42915 // Attempt to merge insertps Op1 with an inner target shuffle node.
42916 SmallVector<int, 8> TargetMask1;
42917 SmallVector<SDValue, 2> Ops1;
42918 APInt KnownUndef1, KnownZero1;
42919 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42920 KnownZero1)) {
42921 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42922 // Zero/UNDEF insertion - zero out element and remove dependency.
42923 InsertPSMask |= (1u << DstIdx);
42924 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42925 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42926 }
42927 // Update insertps mask srcidx and reference the source input directly.
42928 int M = TargetMask1[SrcIdx];
42929 assert(0 <= M && M < 8 && "Shuffle index out of range");
42930 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42931 Op1 = Ops1[M < 4 ? 0 : 1];
42932 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42933 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42934 }
42935
42936 // Attempt to merge insertps Op0 with an inner target shuffle node.
42937 SmallVector<int, 8> TargetMask0;
42938 SmallVector<SDValue, 2> Ops0;
42939 APInt KnownUndef0, KnownZero0;
42940 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42941 KnownZero0)) {
42942 bool Updated = false;
42943 bool UseInput00 = false;
42944 bool UseInput01 = false;
42945 for (int i = 0; i != 4; ++i) {
42946 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42947 // No change if element is already zero or the inserted element.
42948 continue;
42949 }
42950
42951 if (KnownUndef0[i] || KnownZero0[i]) {
42952 // If the target mask is undef/zero then we must zero the element.
42953 InsertPSMask |= (1u << i);
42954 Updated = true;
42955 continue;
42956 }
42957
42958 // The input vector element must be inline.
42959 int M = TargetMask0[i];
42960 if (M != i && M != (i + 4))
42961 return SDValue();
42962
42963 // Determine which inputs of the target shuffle we're using.
42964 UseInput00 |= (0 <= M && M < 4);
42965 UseInput01 |= (4 <= M);
42966 }
42967
42968 // If we're not using both inputs of the target shuffle then use the
42969 // referenced input directly.
42970 if (UseInput00 && !UseInput01) {
42971 Updated = true;
42972 Op0 = Ops0[0];
42973 } else if (!UseInput00 && UseInput01) {
42974 Updated = true;
42975 Op0 = Ops0[1];
42976 }
42977
42978 if (Updated)
42979 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42980 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42981 }
42982
42983 // If we're inserting an element from a vbroadcast load, fold the
42984 // load into the X86insertps instruction. We need to convert the scalar
42985 // load to a vector and clear the source lane of the INSERTPS control.
42986 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42987 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42988 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42989 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42990 MemIntr->getBasePtr(),
42991 MemIntr->getMemOperand());
42992 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42993 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42994 Load),
42995 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42996 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42997 return Insert;
42998 }
42999 }
43000
43001 return SDValue();
43002 }
43003 case X86ISD::VPERMV: {
43004 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
43005 SmallVector<int, 32> Mask;
43006 SmallVector<SDValue, 2> SrcOps, SubOps;
43007 SDValue Src = peekThroughBitcasts(N.getOperand(1));
43008 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
43009 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
43010 collectConcatOps(Src.getNode(), SubOps, DAG)) {
43011 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43012 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
43013 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
43014 "Unexpected split ops");
43015 // Bail if we were permuting a widened vector.
43016 if (SubOps[1].isUndef() &&
43017 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
43018 return SDValue();
43019 // Bail if any subops would have folded into the concat.
43020 if (any_of(SubOps, isShuffleFoldableLoad))
43021 return SDValue();
43022 // Concat 4x128 back to 2x256.
43023 if (SubOps.size() == 4) {
43024 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
43025 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
43026 }
43027 // Convert mask to 2 operand shuffle.
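// Indices into the low half keep addressing the first (widened) source;
// indices into the high half are rebased onto the second source, e.g. with
// NumElts == 8, index 5 becomes 9 (element 1 of the second operand).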
43028 int HalfElts = NumElts / 2;
43029 for (int &M : Mask)
43030 M += M >= HalfElts ? HalfElts : 0;
43031 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43032 VT.getSizeInBits());
43033 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43034 VT.getSizeInBits());
43035 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43036 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43037 }
43038 return SDValue();
43039 }
43040 case X86ISD::VPERMV3: {
43041 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43042 bool CanConcat = VT.is128BitVector() ||
43043 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43044 SmallVector<SDValue, 2> SrcOps;
43045 SmallVector<int, 32> Mask;
43046 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43047 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43048 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43049 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43050 // Canonicalize to VPERMV if both sources are the same.
43051 if (V1 == V2) {
43052 for (int &M : Mask)
43053 M = (M < 0 ? M : (M & (NumElts - 1)));
43054 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43055 DAG.getUNDEF(VT), Subtarget, DAG);
43056 }
43057 // If sources are half width, then concat and use VPERMV with adjusted
43058 // mask.
43059 SDValue Ops[2];
43060 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43061 if (sd_match(V1,
43063 sd_match(V2,
43065 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43066 if (SDValue ConcatSrc =
43067 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43068 for (int &M : Mask)
43069 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43070 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43071 DAG.getUNDEF(VT), Subtarget, DAG);
43072 }
43073 }
43074 // Commute foldable source to the RHS.
43075 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43076 !isShuffleFoldableLoad(N.getOperand(2))) {
43077 ShuffleVectorSDNode::commuteMask(Mask);
43078 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43079 N.getOperand(0), Subtarget, DAG);
43080 }
43081 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43082 // freely concatenated, with a commuted shuffle mask.
43083 if (CanConcat) {
43084 if (SDValue ConcatSrc = combineConcatVectorOps(
43085 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43086 Subtarget)) {
43087 ShuffleVectorSDNode::commuteMask(Mask);
43088 Mask.append(NumElts, SM_SentinelUndef);
43089 SDValue Perm =
43090 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43091 DAG.getUNDEF(WideVT), Subtarget, DAG);
43092 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43093 DAG.getVectorIdxConstant(0, DL));
43094 }
43095 }
43096 }
43097 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43098 // freely concatenated.
43099 if (CanConcat) {
43100 if (SDValue ConcatSrc = combineConcatVectorOps(
43101 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43102 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43103 DL, WideVT.getSizeInBits());
43104 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43105 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43106 DAG.getVectorIdxConstant(0, DL));
43107 }
43108 }
43109 return SDValue();
43110 }
43111 default:
43112 return SDValue();
43113 }
43114
43115 // Nuke no-op shuffles that show up after combining.
43116 if (isNoopShuffleMask(Mask))
43117 return N.getOperand(0);
43118
43119 // Look for simplifications involving one or two shuffle instructions.
43120 SDValue V = N.getOperand(0);
43121 switch (N.getOpcode()) {
43122 default:
43123 break;
43124 case X86ISD::PSHUFLW:
43125 case X86ISD::PSHUFHW:
43126 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43127
43128 // See if this reduces to a PSHUFD which is no more expensive and can
43129 // combine with more operations. Note that it has to at least flip the
43130 // dwords as otherwise it would have been removed as a no-op.
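// E.g. a PSHUFLW mask of {2,3,0,1} swaps the two low dwords wholesale, which
// is exactly a PSHUFD with mask {1,0,2,3}.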
43131 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43132 int DMask[] = {0, 1, 2, 3};
43133 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43134 DMask[DOffset + 0] = DOffset + 1;
43135 DMask[DOffset + 1] = DOffset + 0;
43136 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43137 V = DAG.getBitcast(DVT, V);
43138 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43139 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43140 return DAG.getBitcast(VT, V);
43141 }
43142
43143 // Look for shuffle patterns which can be implemented as a single unpack.
43144 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43145 // only works when we have a PSHUFD followed by two half-shuffles.
43146 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43147 (V.getOpcode() == X86ISD::PSHUFLW ||
43148 V.getOpcode() == X86ISD::PSHUFHW) &&
43149 V.getOpcode() != N.getOpcode() &&
43150 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43151 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43152 if (D.getOpcode() == X86ISD::PSHUFD) {
43153 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43154 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43155 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43156 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43157 int WordMask[8];
43158 for (int i = 0; i < 4; ++i) {
43159 WordMask[i + NOffset] = Mask[i] + NOffset;
43160 WordMask[i + VOffset] = VMask[i] + VOffset;
43161 }
43162 // Map the word mask through the DWord mask.
43163 int MappedMask[8];
43164 for (int i = 0; i < 8; ++i)
43165 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43166 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43167 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43168 // We can replace all three shuffles with an unpack.
43169 V = DAG.getBitcast(VT, D.getOperand(0));
43170 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43172 DL, VT, V, V);
43173 }
43174 }
43175 }
43176
43177 break;
43178
43179 case X86ISD::PSHUFD:
43180 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43181 return NewN;
43182
43183 break;
43184 }
43185
43186 return SDValue();
43187}
43188
43189/// Checks if the shuffle mask takes subsequent elements
43190/// alternately from two vectors.
43191/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43192static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43193
43194 int ParitySrc[2] = {-1, -1};
43195 unsigned Size = Mask.size();
43196 for (unsigned i = 0; i != Size; ++i) {
43197 int M = Mask[i];
43198 if (M < 0)
43199 continue;
43200
43201 // Make sure we are using the matching element from the input.
43202 if ((M % Size) != i)
43203 return false;
43204
43205 // Make sure we use the same input for all elements of the same parity.
43206 int Src = M / Size;
43207 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43208 return false;
43209 ParitySrc[i % 2] = Src;
43210 }
43211
43212 // Make sure each input is used.
43213 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43214 return false;
43215
43216 Op0Even = ParitySrc[0] == 0;
43217 return true;
43218}
43219
43220 /// Returns true iff the shuffle node \p N can be replaced with an
43221 /// ADDSUB(SUBADD) operation. If so, the operands of that operation are
43222 /// written to the parameters \p Opnd0 and \p Opnd1.
43223///
43224 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
43225 /// shuffle nodes so it is easier to generically match. We also insert dummy
43226 /// vector shuffle nodes for the operands which explicitly discard the lanes
43227 /// which are unused by this operation so that the rest of the combiner can
43228 /// see that they're unused.
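///
/// E.g. for v4f32, ADDSUB(A,B) computes {A0-B0, A1+B1, A2-B2, A3+B3}, so the
/// shuffle must take its even lanes from the FSUB node and its odd lanes from
/// the FADD node; the reverse selection corresponds to the SUBADD form.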
43229static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43230 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43231 bool &IsSubAdd, bool &HasAllowContract) {
43232
43233 EVT VT = N->getValueType(0);
43234 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43235 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43236 !VT.getSimpleVT().isFloatingPoint())
43237 return false;
43238
43239 // We only handle target-independent shuffles.
43240 // FIXME: It would be easy and harmless to use the target shuffle mask
43241 // extraction tool to support more.
43242 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43243 return false;
43244
43245 SDValue V1 = N->getOperand(0);
43246 SDValue V2 = N->getOperand(1);
43247
43248 // Make sure we have an FADD and an FSUB.
43249 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43250 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43251 V1.getOpcode() == V2.getOpcode())
43252 return false;
43253
43254 // If there are other uses of these operations we can't fold them.
43255 if (!V1->hasOneUse() || !V2->hasOneUse())
43256 return false;
43257
43258 // Ensure that both operations have the same operands. Note that we can
43259 // commute the FADD operands.
43260 SDValue LHS, RHS;
43261 if (V1.getOpcode() == ISD::FSUB) {
43262 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43263 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43264 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43265 return false;
43266 } else {
43267 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43268 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43269 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43270 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43271 return false;
43272 }
43273
43274 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43275 bool Op0Even;
43276 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43277 return false;
43278
43279 // It's a subadd if the vector in the even parity is an FADD.
43280 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43281 : V2->getOpcode() == ISD::FADD;
43282 HasAllowContract =
43283 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43284
43285 Opnd0 = LHS;
43286 Opnd1 = RHS;
43287 return true;
43288}
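// Illustrative sketch of the DAG shape this recognizes (operand names are
// placeholders for exposition): for v4f32,
//   t0 = fsub a, b
//   t1 = fadd a, b
//   shuffle t0, t1, <0, 5, 2, 7>
// yields Opnd0 = a, Opnd1 = b and IsSubAdd = false, since the even lanes
// take the FSUB result; the caller can then form X86ISD::ADDSUB(a, b).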
43289
43290/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43291static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43292 const X86Subtarget &Subtarget,
43293 SelectionDAG &DAG) {
43294 // We only handle target-independent shuffles.
43295 // FIXME: It would be easy and harmless to use the target shuffle mask
43296 // extraction tool to support more.
43297 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43298 return SDValue();
43299
43300 MVT VT = N->getSimpleValueType(0);
43301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43302 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43303 return SDValue();
43304
43305 // We're trying to match (shuffle fma(a, b, c), X86ISD::FMSUB(a, b, c)).
43306 SDValue Op0 = N->getOperand(0);
43307 SDValue Op1 = N->getOperand(1);
43308 SDValue FMAdd = Op0, FMSub = Op1;
43309 if (FMSub.getOpcode() != X86ISD::FMSUB)
43310 std::swap(FMAdd, FMSub);
43311
43312 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43313 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43314 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43315 FMAdd.getOperand(2) != FMSub.getOperand(2))
43316 return SDValue();
43317
43318 // Check for correct shuffle mask.
43319 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43320 bool Op0Even;
43321 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43322 return SDValue();
43323
43324 // FMAddSub takes zeroth operand from FMSub node.
43325 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43326 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43327 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43328 FMAdd.getOperand(2));
43329}
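// Illustrative sketch (operand names are placeholders for exposition): with
//   t0 = fma a, b, c
//   t1 = X86ISD::FMSUB a, b, c
// a shuffle of t0 and t1 whose mask passes isAddSubOrSubAddMask becomes
// X86ISD::FMADDSUB(a, b, c) when the even lanes take the FMSUB result, and
// X86ISD::FMSUBADD(a, b, c) when the even lanes take the FMA result.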
43330
43331/// Try to combine a shuffle into a target-specific add-sub or
43332/// mul-add-sub node.
43333static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43334 const X86Subtarget &Subtarget,
43335 SelectionDAG &DAG) {
43336 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43337 return V;
43338
43339 SDValue Opnd0, Opnd1;
43340 bool IsSubAdd;
43341 bool HasAllowContract;
43342 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43343 HasAllowContract))
43344 return SDValue();
43345
43346 MVT VT = N->getSimpleValueType(0);
43347
43348 // Try to generate X86ISD::FMADDSUB node here.
43349 SDValue Opnd2;
43350 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43351 HasAllowContract)) {
43352 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43353 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43354 }
43355
43356 if (IsSubAdd)
43357 return SDValue();
43358
43359 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43360 // the ADDSUB idiom has been successfully recognized. There are no known
43361 // X86 targets with 512-bit ADDSUB instructions!
43362 if (VT.is512BitVector())
43363 return SDValue();
43364
43365 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43366 // the ADDSUB idiom has been successfully recognized. There are no known
43367 // X86 targets with FP16 ADDSUB instructions!
43368 if (VT.getVectorElementType() == MVT::f16)
43369 return SDValue();
43370
43371 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43372}
43373
43374/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43375/// low half of each source vector and does not set any high half elements in
43376/// the destination vector, narrow the shuffle to half its original size.
43377static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43378 EVT VT = Shuf->getValueType(0);
43379 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43380 return SDValue();
43381 if (!VT.is256BitVector() && !VT.is512BitVector())
43382 return SDValue();
43383
43384 // See if we can ignore all of the high elements of the shuffle.
43385 ArrayRef<int> Mask = Shuf->getMask();
43386 if (!isUndefUpperHalf(Mask))
43387 return SDValue();
43388
43389 // Check if the shuffle mask accesses only the low half of each input vector
43390 // (half-index output is 0 or 2).
43391 int HalfIdx1, HalfIdx2;
43392 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43393 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43394 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43395 return SDValue();
43396
43397 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43398 // The trick is knowing that all of the insert/extract are actually free
43399 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43400 // of narrow inputs into a narrow output, and that is always cheaper than
43401 // the wide shuffle that we started with.
43402 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43403 Shuf->getOperand(1), HalfMask, HalfIdx1,
43404 HalfIdx2, false, DAG, /*UseConcat*/ true);
43405}
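// Illustrative sketch (types chosen for exposition): a v8f32 shuffle with
// mask <0, 8, 1, 9, -1, -1, -1, -1> reads only the low v4f32 half of each
// input and leaves the high half of the result undef, so it can be rebuilt
// as a v4f32 shuffle of the extracted low halves plus a free subregister
// insertion back into a v8f32 value.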
43406
43407static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43408 TargetLowering::DAGCombinerInfo &DCI,
43409 const X86Subtarget &Subtarget) {
43410 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43411 if (SDValue V = narrowShuffle(Shuf, DAG))
43412 return V;
43413
43414 // If we have legalized the vector types, look for blends of FADD and FSUB
43415 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43416 SDLoc dl(N);
43417 EVT VT = N->getValueType(0);
43418 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43419 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43420 if (SDValue AddSub =
43421 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43422 return AddSub;
43423
43424 // Attempt to combine into a vector load/broadcast.
43425 if (SDValue LD = combineToConsecutiveLoads(
43426 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43427 return LD;
43428
43429 if (isTargetShuffle(N->getOpcode())) {
43430 SDValue Op(N, 0);
43431 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43432 return Shuffle;
43433
43434 // Try recursively combining arbitrary sequences of x86 shuffle
43435 // instructions into higher-order shuffles. We do this after combining
43436 // specific PSHUF instruction sequences into their minimal form so that we
43437 // can evaluate how many specialized shuffle instructions are involved in
43438 // a particular chain.
43439 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43440 return Res;
43441
43442 // Simplify source operands based on shuffle mask.
43443 // TODO - merge this into combineX86ShufflesRecursively.
43444 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43445 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43446 return SDValue(N, 0);
43447
43448 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43449 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43450 // Perform this after other shuffle combines to allow inner shuffles to be
43451 // combined away first.
43452 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43453 return BinOp;
43454 }
43455
43456 return SDValue();
43457}
43458
43459// Simplify variable target shuffle masks based on the demanded elements.
43460// TODO: Handle DemandedBits in mask indices as well?
43461bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43462 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43463 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43464 // If we're demanding all elements, don't bother trying to simplify the mask.
43465 unsigned NumElts = DemandedElts.getBitWidth();
43466 if (DemandedElts.isAllOnes())
43467 return false;
43468
43469 SDValue Mask = Op.getOperand(MaskIndex);
43470 if (!Mask.hasOneUse())
43471 return false;
43472
43473 // Attempt to generically simplify the variable shuffle mask.
43474 APInt MaskUndef, MaskZero;
43475 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43476 Depth + 1))
43477 return true;
43478
43479 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43480 // TODO: Support other types from getTargetShuffleMaskIndices?
43481 SDValue BC = peekThroughOneUseBitcasts(Mask);
43482 EVT BCVT = BC.getValueType();
43483 auto *Load = dyn_cast<LoadSDNode>(BC);
43484 if (!Load || !Load->getBasePtr().hasOneUse())
43485 return false;
43486
43487 const Constant *C = getTargetConstantFromNode(Load);
43488 if (!C)
43489 return false;
43490
43491 Type *CTy = C->getType();
43492 if (!CTy->isVectorTy() ||
43493 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43494 return false;
43495
43496 // Handle scaling for i64 elements on 32-bit targets.
43497 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43498 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43499 return false;
43500 unsigned Scale = NumCstElts / NumElts;
43501
43502 // Simplify mask if we have an undemanded element that is not undef.
43503 bool Simplified = false;
43504 SmallVector<Constant *, 32> ConstVecOps;
43505 for (unsigned i = 0; i != NumCstElts; ++i) {
43506 Constant *Elt = C->getAggregateElement(i);
43507 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43508 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43509 Simplified = true;
43510 continue;
43511 }
43512 ConstVecOps.push_back(Elt);
43513 }
43514 if (!Simplified)
43515 return false;
43516
43517 // Generate new constant pool entry + legalize immediately for the load.
43518 SDLoc DL(Op);
43519 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43520 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43521 SDValue NewMask = TLO.DAG.getLoad(
43522 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43523 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43524 Load->getAlign());
43525 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43526}
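// Illustrative sketch of the effect above (an assumed scenario, not a
// specific test): if a PSHUFB control mask is a constant-pool load and only
// half of the result elements are demanded, the constant elements feeding
// the undemanded lanes are rewritten to undef and a fresh constant-pool
// load is emitted, leaving later combines a simpler constant to reason about.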
43527
43528bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43529 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43530 TargetLoweringOpt &TLO, unsigned Depth) const {
43531 int NumElts = DemandedElts.getBitWidth();
43532 unsigned Opc = Op.getOpcode();
43533 EVT VT = Op.getValueType();
43534
43535 // Handle special case opcodes.
43536 switch (Opc) {
43537 case X86ISD::PMULDQ:
43538 case X86ISD::PMULUDQ: {
43539 APInt LHSUndef, LHSZero;
43540 APInt RHSUndef, RHSZero;
43541 SDValue LHS = Op.getOperand(0);
43542 SDValue RHS = Op.getOperand(1);
43543 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43544 Depth + 1))
43545 return true;
43546 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43547 Depth + 1))
43548 return true;
43549 // Multiply by zero.
43550 KnownZero = LHSZero | RHSZero;
43551 break;
43552 }
43553 case X86ISD::VPMADDUBSW:
43554 case X86ISD::VPMADDWD: {
43555 APInt LHSUndef, LHSZero;
43556 APInt RHSUndef, RHSZero;
43557 SDValue LHS = Op.getOperand(0);
43558 SDValue RHS = Op.getOperand(1);
43559 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43560
43561 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43562 Depth + 1))
43563 return true;
43564 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43565 Depth + 1))
43566 return true;
43567
43568 // TODO: Multiply by zero.
43569
43570 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43571 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43572 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43573 Depth + 1))
43574 return true;
43575 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43576 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43577 Depth + 1))
43578 return true;
43579 break;
43580 }
43581 case X86ISD::PSADBW: {
43582 SDValue LHS = Op.getOperand(0);
43583 SDValue RHS = Op.getOperand(1);
43584 assert(VT.getScalarType() == MVT::i64 &&
43585 LHS.getValueType() == RHS.getValueType() &&
43586 LHS.getValueType().getScalarType() == MVT::i8 &&
43587 "Unexpected PSADBW types");
43588
43589 // Aggressively peek through ops to get at the demanded elts.
43590 if (!DemandedElts.isAllOnes()) {
43591 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43592 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43593 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43594 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43595 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43596 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43597 if (NewLHS || NewRHS) {
43598 NewLHS = NewLHS ? NewLHS : LHS;
43599 NewRHS = NewRHS ? NewRHS : RHS;
43600 return TLO.CombineTo(
43601 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43602 }
43603 }
43604 break;
43605 }
43606 case X86ISD::VSHL:
43607 case X86ISD::VSRL:
43608 case X86ISD::VSRA: {
43609 // We only need the bottom 64-bits of the (128-bit) shift amount.
43610 SDValue Amt = Op.getOperand(1);
43611 MVT AmtVT = Amt.getSimpleValueType();
43612 assert(AmtVT.is128BitVector() && "Unexpected value type");
43613
43614 // If the shift amount is used solely as an SSE shift amount then we know
43615 // that only the bottom 64-bits are ever used.
43616 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43617 unsigned UseOpc = Use->getOpcode();
43618 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43619 UseOpc == X86ISD::VSRA) &&
43620 Use->getOperand(0) != Amt;
43621 });
43622
43623 APInt AmtUndef, AmtZero;
43624 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43625 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43626 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43627 Depth + 1, AssumeSingleUse))
43628 return true;
43629 [[fallthrough]];
43630 }
43631 case X86ISD::VSHLI:
43632 case X86ISD::VSRLI:
43633 case X86ISD::VSRAI: {
43634 SDValue Src = Op.getOperand(0);
43635 APInt SrcUndef;
43636 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43637 Depth + 1))
43638 return true;
43639
43640 // Fold shift(0,x) -> 0
43641 if (DemandedElts.isSubsetOf(KnownZero))
43642 return TLO.CombineTo(
43643 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43644
43645 // Aggressively peek through ops to get at the demanded elts.
43646 if (!DemandedElts.isAllOnes())
43647 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43648 Src, DemandedElts, TLO.DAG, Depth + 1))
43649 return TLO.CombineTo(
43650 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43651 break;
43652 }
43653 case X86ISD::VPSHA:
43654 case X86ISD::VPSHL:
43655 case X86ISD::VSHLV:
43656 case X86ISD::VSRLV:
43657 case X86ISD::VSRAV: {
43658 APInt LHSUndef, LHSZero;
43659 APInt RHSUndef, RHSZero;
43660 SDValue LHS = Op.getOperand(0);
43661 SDValue RHS = Op.getOperand(1);
43662 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43663 Depth + 1))
43664 return true;
43665
43666 // Fold shift(0,x) -> 0
43667 if (DemandedElts.isSubsetOf(LHSZero))
43668 return TLO.CombineTo(
43669 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43670
43671 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43672 Depth + 1))
43673 return true;
43674
43675 KnownZero = LHSZero;
43676 break;
43677 }
43678 case X86ISD::CMPM:
43679 case X86ISD::CMPP: {
43680 // Scalarize packed fp comparison if we only require element 0.
43681 if (DemandedElts == 1) {
43682 SDLoc dl(Op);
43683 MVT VT = Op.getSimpleValueType();
43684 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43685 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43686 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43687 SDValue CC = Op.getOperand(2);
43688 if (Opc == X86ISD::CMPM) {
43689 SDValue Cmp =
43690 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43691 return TLO.CombineTo(
43692 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43693 }
43694 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43695 return TLO.CombineTo(Op,
43696 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43697 }
43698 break;
43699 }
43700 case X86ISD::PCMPEQ:
43701 case X86ISD::PCMPGT: {
43702 APInt LHSUndef, LHSZero;
43703 APInt RHSUndef, RHSZero;
43704 SDValue LHS = Op.getOperand(0);
43705 SDValue RHS = Op.getOperand(1);
43706 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43707 Depth + 1))
43708 return true;
43709 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43710 Depth + 1))
43711 return true;
43712 break;
43713 }
43714 case X86ISD::KSHIFTL: {
43715 SDValue Src = Op.getOperand(0);
43716 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43717 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43718 unsigned ShiftAmt = Amt->getZExtValue();
43719
43720 if (ShiftAmt == 0)
43721 return TLO.CombineTo(Op, Src);
43722
43723 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43724 // single shift. We can do this if the bottom bits (which are shifted
43725 // out) are never demanded.
43726 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43727 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43728 unsigned C1 = Src.getConstantOperandVal(1);
43729 unsigned NewOpc = X86ISD::KSHIFTL;
43730 int Diff = ShiftAmt - C1;
43731 if (Diff < 0) {
43732 Diff = -Diff;
43733 NewOpc = X86ISD::KSHIFTR;
43734 }
43735
43736 SDLoc dl(Op);
43737 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43738 return TLO.CombineTo(
43739 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43740 }
43741 }
43742
43743 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43744 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43745 Depth + 1))
43746 return true;
43747
43748 KnownUndef <<= ShiftAmt;
43749 KnownZero <<= ShiftAmt;
43750 KnownZero.setLowBits(ShiftAmt);
43751 break;
43752 }
43753 case X86ISD::KSHIFTR: {
43754 SDValue Src = Op.getOperand(0);
43755 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43756 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43757 unsigned ShiftAmt = Amt->getZExtValue();
43758
43759 if (ShiftAmt == 0)
43760 return TLO.CombineTo(Op, Src);
43761
43762 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43763 // single shift. We can do this if the top bits (which are shifted
43764 // out) are never demanded.
43765 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43766 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43767 unsigned C1 = Src.getConstantOperandVal(1);
43768 unsigned NewOpc = X86ISD::KSHIFTR;
43769 int Diff = ShiftAmt - C1;
43770 if (Diff < 0) {
43771 Diff = -Diff;
43772 NewOpc = X86ISD::KSHIFTL;
43773 }
43774
43775 SDLoc dl(Op);
43776 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43777 return TLO.CombineTo(
43778 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43779 }
43780 }
43781
43782 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43783 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43784 Depth + 1))
43785 return true;
43786
43787 KnownUndef.lshrInPlace(ShiftAmt);
43788 KnownZero.lshrInPlace(ShiftAmt);
43789 KnownZero.setHighBits(ShiftAmt);
43790 break;
43791 }
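// Worked example for the KSHIFT folds above (mask width and shift amounts
// chosen for exposition): for a v8i1 mask, KSHIFTR(KSHIFTL(k, 6), 2) with
// the top two result lanes not demanded simplifies to KSHIFTL(k, 4); the
// lanes where the two forms differ are exactly the ones that are never read.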
43792 case X86ISD::ANDNP: {
43793 // ANDNP = (~LHS & RHS);
43794 SDValue LHS = Op.getOperand(0);
43795 SDValue RHS = Op.getOperand(1);
43796
43797 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43798 APInt UndefElts;
43799 SmallVector<APInt> EltBits;
43800 int NumElts = VT.getVectorNumElements();
43801 int EltSizeInBits = VT.getScalarSizeInBits();
43802 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43803 APInt OpElts = DemandedElts;
43804 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43805 EltBits)) {
43806 OpBits.clearAllBits();
43807 OpElts.clearAllBits();
43808 for (int I = 0; I != NumElts; ++I) {
43809 if (!DemandedElts[I])
43810 continue;
43811 if (UndefElts[I]) {
43812 // We can't assume an undef src element gives an undef dst - the
43813 // other src might be zero.
43814 OpBits.setAllBits();
43815 OpElts.setBit(I);
43816 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43817 (!Invert && !EltBits[I].isZero())) {
43818 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43819 OpElts.setBit(I);
43820 }
43821 }
43822 }
43823 return std::make_pair(OpBits, OpElts);
43824 };
43825 APInt BitsLHS, EltsLHS;
43826 APInt BitsRHS, EltsRHS;
43827 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43828 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43829
43830 APInt LHSUndef, LHSZero;
43831 APInt RHSUndef, RHSZero;
43832 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43833 Depth + 1))
43834 return true;
43835 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43836 Depth + 1))
43837 return true;
43838
43839 if (!DemandedElts.isAllOnes()) {
43840 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43841 TLO.DAG, Depth + 1);
43842 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43843 TLO.DAG, Depth + 1);
43844 if (NewLHS || NewRHS) {
43845 NewLHS = NewLHS ? NewLHS : LHS;
43846 NewRHS = NewRHS ? NewRHS : RHS;
43847 return TLO.CombineTo(
43848 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43849 }
43850 }
43851 break;
43852 }
43853 case X86ISD::CVTSI2P:
43854 case X86ISD::CVTUI2P:
43855 case X86ISD::CVTPH2PS:
43856 case X86ISD::CVTPS2PH: {
43857 SDValue Src = Op.getOperand(0);
43858 EVT SrcVT = Src.getValueType();
43859 APInt SrcUndef, SrcZero;
43860 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43861 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43862 Depth + 1))
43863 return true;
43864 break;
43865 }
43866 case X86ISD::PACKSS:
43867 case X86ISD::PACKUS: {
43868 SDValue N0 = Op.getOperand(0);
43869 SDValue N1 = Op.getOperand(1);
43870
43871 APInt DemandedLHS, DemandedRHS;
43872 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43873
43874 APInt LHSUndef, LHSZero;
43875 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43876 Depth + 1))
43877 return true;
43878 APInt RHSUndef, RHSZero;
43879 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43880 Depth + 1))
43881 return true;
43882
43883 // TODO - pass on known zero/undef.
43884
43885 // Aggressively peek through ops to get at the demanded elts.
43886 // TODO - we should do this for all target/faux shuffle ops.
43887 if (!DemandedElts.isAllOnes()) {
43888 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43889 TLO.DAG, Depth + 1);
43890 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43891 TLO.DAG, Depth + 1);
43892 if (NewN0 || NewN1) {
43893 NewN0 = NewN0 ? NewN0 : N0;
43894 NewN1 = NewN1 ? NewN1 : N1;
43895 return TLO.CombineTo(Op,
43896 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43897 }
43898 }
43899 break;
43900 }
43901 case X86ISD::HADD:
43902 case X86ISD::HSUB:
43903 case X86ISD::FHADD:
43904 case X86ISD::FHSUB: {
43905 SDValue N0 = Op.getOperand(0);
43906 SDValue N1 = Op.getOperand(1);
43907
43908 APInt DemandedLHS, DemandedRHS;
43909 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43910
43911 APInt LHSUndef, LHSZero;
43912 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43913 Depth + 1))
43914 return true;
43915 APInt RHSUndef, RHSZero;
43916 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43917 Depth + 1))
43918 return true;
43919
43920 // TODO - pass on known zero/undef.
43921
43922 // Aggressively peek through ops to get at the demanded elts.
43923 // TODO: Handle repeated operands.
43924 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43925 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43926 TLO.DAG, Depth + 1);
43927 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43928 TLO.DAG, Depth + 1);
43929 if (NewN0 || NewN1) {
43930 NewN0 = NewN0 ? NewN0 : N0;
43931 NewN1 = NewN1 ? NewN1 : N1;
43932 return TLO.CombineTo(Op,
43933 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43934 }
43935 }
43936 break;
43937 }
43938 case X86ISD::VTRUNC:
43939 case X86ISD::VTRUNCS:
43940 case X86ISD::VTRUNCUS: {
43941 SDValue Src = Op.getOperand(0);
43942 MVT SrcVT = Src.getSimpleValueType();
43943 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43944 APInt SrcUndef, SrcZero;
43945 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43946 Depth + 1))
43947 return true;
43948 KnownZero = SrcZero.zextOrTrunc(NumElts);
43949 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43950 break;
43951 }
43952 case X86ISD::BLENDI: {
43953 SmallVector<int, 16> BlendMask;
43954 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43955 if (SDValue R = combineBlendOfPermutes(
43956 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43957 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43958 return TLO.CombineTo(Op, R);
43959 break;
43960 }
43961 case X86ISD::BLENDV: {
43962 APInt SelUndef, SelZero;
43963 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43964 SelZero, TLO, Depth + 1))
43965 return true;
43966
43967 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43968 APInt LHSUndef, LHSZero;
43969 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43970 LHSZero, TLO, Depth + 1))
43971 return true;
43972
43973 APInt RHSUndef, RHSZero;
43974 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43975 RHSZero, TLO, Depth + 1))
43976 return true;
43977
43978 KnownZero = LHSZero & RHSZero;
43979 KnownUndef = LHSUndef & RHSUndef;
43980 break;
43981 }
43982 case X86ISD::VZEXT_MOVL: {
43983 // If upper demanded elements are already zero then we have nothing to do.
43984 SDValue Src = Op.getOperand(0);
43985 APInt DemandedUpperElts = DemandedElts;
43986 DemandedUpperElts.clearLowBits(1);
43987 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43988 return TLO.CombineTo(Op, Src);
43989 break;
43990 }
43991 case X86ISD::VZEXT_LOAD: {
43992 // If the upper elements are not demanded then simplify to a
43993 // scalar_to_vector(load()).
43994 MVT SVT = VT.getSimpleVT().getVectorElementType();
43995 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43996 SDLoc DL(Op);
43997 auto *Mem = cast<MemSDNode>(Op);
43998 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43999 Mem->getMemOperand());
44000 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
44001 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
44002 }
44003 break;
44004 }
44005 case X86ISD::VBROADCAST: {
44006 SDValue Src = Op.getOperand(0);
44007 MVT SrcVT = Src.getSimpleValueType();
44008 // Don't bother broadcasting if we just need the 0'th element.
44009 if (DemandedElts == 1) {
44010 if (!SrcVT.isVector())
44011 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
44012 else if (Src.getValueType() != VT)
44013 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
44014 SDLoc(Op));
44015 return TLO.CombineTo(Op, Src);
44016 }
44017 if (!SrcVT.isVector())
44018 break;
44019 APInt SrcUndef, SrcZero;
44020 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
44021 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
44022 Depth + 1))
44023 return true;
44024 // Aggressively peek through src to get at the demanded elt.
44025 // TODO - we should do this for all target/faux shuffle ops.
44026 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
44027 Src, SrcElts, TLO.DAG, Depth + 1))
44028 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44029 break;
44030 }
44031 case X86ISD::VPERMV:
44032 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44033 Depth))
44034 return true;
44035 break;
44036 case X86ISD::PSHUFB:
44037 case X86ISD::VPERMV3:
44038 case X86ISD::VPERMILPV:
44039 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44040 Depth))
44041 return true;
44042 break;
44043 case X86ISD::VPPERM:
44044 case X86ISD::VPERMIL2:
44045 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44046 Depth))
44047 return true;
44048 break;
44049 }
44050
44051 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44052 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44053 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44054 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44055 DemandedElts.lshr(NumElts / 2) == 0) {
44056 unsigned SizeInBits = VT.getSizeInBits();
44057 unsigned ExtSizeInBits = SizeInBits / 2;
44058
44059 // See if 512-bit ops only use the bottom 128-bits.
44060 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44061 ExtSizeInBits = SizeInBits / 4;
44062
44063 switch (Opc) {
44064 // Scalar broadcast.
44065 case X86ISD::VBROADCAST: {
44066 SDLoc DL(Op);
44067 SDValue Src = Op.getOperand(0);
44068 if (Src.getValueSizeInBits() > ExtSizeInBits)
44069 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44070 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44071 ExtSizeInBits / VT.getScalarSizeInBits());
44072 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44073 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44074 TLO.DAG, DL, ExtSizeInBits));
44075 }
44076 case X86ISD::VBROADCAST_LOAD: {
44077 SDLoc DL(Op);
44078 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44079 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44080 ExtSizeInBits / VT.getScalarSizeInBits());
44081 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44082 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44083 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44084 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44085 MemIntr->getMemOperand());
44086 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44087 Bcst.getValue(1));
44088 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44089 TLO.DAG, DL, ExtSizeInBits));
44090 }
44091 // Subvector broadcast.
44092 case X86ISD::SUBV_BROADCAST_LOAD: {
44093 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44094 EVT MemVT = MemIntr->getMemoryVT();
44095 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44096 SDLoc DL(Op);
44097 SDValue Ld =
44098 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44099 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44100 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44101 Ld.getValue(1));
44102 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44103 TLO.DAG, DL, ExtSizeInBits));
44104 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44105 SDLoc DL(Op);
44106 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44107 ExtSizeInBits / VT.getScalarSizeInBits());
44108 if (SDValue BcstLd =
44109 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44110 return TLO.CombineTo(Op,
44111 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44112 TLO.DAG, DL, ExtSizeInBits));
44113 }
44114 break;
44115 }
44116 // Byte shifts by immediate.
44117 case X86ISD::VSHLDQ:
44118 case X86ISD::VSRLDQ:
44119 // Shift by uniform.
44120 case X86ISD::VSHL:
44121 case X86ISD::VSRL:
44122 case X86ISD::VSRA:
44123 // Shift by immediate.
44124 case X86ISD::VSHLI:
44125 case X86ISD::VSRLI:
44126 case X86ISD::VSRAI: {
44127 SDLoc DL(Op);
44128 SDValue Ext0 =
44129 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44130 SDValue ExtOp =
44131 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44132 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44133 SDValue Insert =
44134 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44135 return TLO.CombineTo(Op, Insert);
44136 }
44137 case X86ISD::VPERMI: {
44138 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44139 // TODO: This should be done in shuffle combining.
44140 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44141 SmallVector<int, 8> Mask;
44142 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44143 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44144 SDLoc DL(Op);
44145 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44146 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44147 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44148 return TLO.CombineTo(Op, Insert);
44149 }
44150 }
44151 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44152 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44153 SDLoc DL(Op);
44154 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44155 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44156 Op.getOperand(1));
44157 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44158 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44159 return TLO.CombineTo(Op, Insert);
44160 }
44161 break;
44162 }
44163 case X86ISD::VPERMV: {
44164 SmallVector<int, 16> Mask;
44165 SmallVector<SDValue, 2> Ops;
44166 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44167 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44168 VT == MVT::v16f32) &&
44169 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44170 // For lane-crossing shuffles, only split in half in case we're still
44171 // referencing higher elements.
44172 unsigned HalfElts = NumElts / 2;
44173 unsigned HalfSize = SizeInBits / 2;
44174 Mask.resize(HalfElts);
44175 if (all_of(Mask,
44176 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44177 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44178 SDLoc DL(Op);
44179 SDValue Ext;
44180 SDValue M =
44181 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44182 SDValue V =
44183 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44184 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44185 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44186 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44187 else {
44188 MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44189 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44190 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44191 TLO.DAG.getBitcast(ShufVT, V), M);
44192 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44193 }
44194 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44195 Subtarget, TLO.DAG, DL, SizeInBits);
44196 return TLO.CombineTo(Op, Insert);
44197 }
44198 }
44199 break;
44200 }
44201 case X86ISD::VPERMV3: {
44202 SmallVector<int, 16> Mask;
44203 SmallVector<SDValue, 2> Ops;
44204 if (Subtarget.hasVLX() &&
44205 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44206 // For lane-crossing shuffles, only split in half in case we're still
44207 // referencing higher elements.
44208 unsigned HalfElts = NumElts / 2;
44209 unsigned HalfSize = SizeInBits / 2;
44210 Mask.resize(HalfElts);
44211 if (all_of(Mask, [&](int M) {
44212 return isUndefOrInRange(M, 0, HalfElts) ||
44213 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44214 })) {
44215 // Adjust mask elements for 2nd operand to point to half width.
44216 for (int &M : Mask)
44217 M = (M < NumElts) ? M : (M - HalfElts);
44218 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44219 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44220 SDLoc DL(Op);
44221 SDValue Ext = TLO.DAG.getNode(
44222 Opc, DL, HalfVT,
44223 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44224 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44225 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44226 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44227 Subtarget, TLO.DAG, DL, SizeInBits);
44228 return TLO.CombineTo(Op, Insert);
44229 }
44230 }
44231 break;
44232 }
44233 case X86ISD::VPERM2X128: {
44234 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44235 SDLoc DL(Op);
44236 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44237 if (LoMask & 0x8)
44238 return TLO.CombineTo(
44239 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44240 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44241 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44242 SDValue ExtOp =
44243 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44244 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44245 SDValue Insert =
44246 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44247 return TLO.CombineTo(Op, Insert);
44248 }
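// A note on the immediate decode above (restating the code, no new
// behaviour): only the low nibble of the VPERM2X128 immediate matters once
// just the low 128-bit result lane is demanded - bit 3 zeroes that lane,
// bit 1 selects which source operand supplies it, and bit 0 selects that
// operand's low or high 128-bit half.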
44249 // Conversions.
44250 // TODO: Add more CVT opcodes when we have test coverage.
44251 case X86ISD::CVTTP2UI: {
44252 if (!Subtarget.hasVLX())
44253 break;
44254 [[fallthrough]];
44255 }
44256 case X86ISD::CVTTP2SI: {
44257 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44258 !Subtarget.hasVLX())
44259 break;
44260 [[fallthrough]];
44261 }
44262 case X86ISD::CVTPH2PS: {
44263 SDLoc DL(Op);
44264 unsigned Scale = SizeInBits / ExtSizeInBits;
44265 SDValue SrcOp = Op.getOperand(0);
44266 MVT SrcVT = SrcOp.getSimpleValueType();
44267 unsigned SrcExtSize =
44268 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44269 MVT ExtVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
44270 ExtSizeInBits / VT.getScalarSizeInBits());
44271 SDValue ExtOp = TLO.DAG.getNode(
44272 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44273 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44274 SDValue Insert =
44275 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44276 return TLO.CombineTo(Op, Insert);
44277 }
44278 // Zero upper elements.
44279 case X86ISD::VZEXT_MOVL:
44280 // Variable blend.
44281 case X86ISD::BLENDV:
44282 // Target unary shuffles:
44283 case X86ISD::MOVDDUP:
44284 // Target unary shuffles by immediate:
44285 case X86ISD::PSHUFD:
44286 case X86ISD::PSHUFLW:
44287 case X86ISD::PSHUFHW:
44288 case X86ISD::VPERMILPI:
44289 // (Non-Lane Crossing) Target Shuffles.
44290 case X86ISD::VPERMILPV:
44291 case X86ISD::VPERMIL2:
44292 case X86ISD::PSHUFB:
44293 case X86ISD::UNPCKL:
44294 case X86ISD::UNPCKH:
44295 case X86ISD::BLENDI:
44296 // Integer ops.
44297 case X86ISD::PACKSS:
44298 case X86ISD::PACKUS:
44299 case X86ISD::PCMPEQ:
44300 case X86ISD::PCMPGT:
44301 case X86ISD::PMULUDQ:
44302 case X86ISD::PMULDQ:
44303 case X86ISD::VSHLV:
44304 case X86ISD::VSRLV:
44305 case X86ISD::VSRAV:
44306 // Float ops.
44307 case X86ISD::FMAX:
44308 case X86ISD::FMIN:
44309 case X86ISD::FMAXC:
44310 case X86ISD::FMINC:
44311 case X86ISD::FRSQRT:
44312 case X86ISD::FRCP:
44313 // Horizontal Ops.
44314 case X86ISD::HADD:
44315 case X86ISD::HSUB:
44316 case X86ISD::FHADD:
44317 case X86ISD::FHSUB: {
44318 SDLoc DL(Op);
44319 SmallVector<SDValue, 4> Ops;
44320 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44321 SDValue SrcOp = Op.getOperand(i);
44322 EVT SrcVT = SrcOp.getValueType();
44323 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44324 "Unsupported vector size");
44325 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44326 ExtSizeInBits)
44327 : SrcOp);
44328 }
44329 MVT ExtVT = VT.getSimpleVT();
44330 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44331 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44332 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44333 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44334 SDValue Insert =
44335 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44336 return TLO.CombineTo(Op, Insert);
44337 }
44338 }
44339 }
44340
44341 // For splats, unless we *only* demand the 0'th element, stop attempts at
44342 // simplification here: we aren't going to improve things, and this is
44343 // better than any potential shuffle.
44344 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44345 return false;
44346
44347 // Get target/faux shuffle mask.
44348 APInt OpUndef, OpZero;
44349 SmallVector<int, 64> OpMask;
44350 SmallVector<SDValue, 2> OpInputs;
44351 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44352 OpZero, TLO.DAG, Depth, false))
44353 return false;
44354
44355 // Shuffle inputs must be the same size as the result.
44356 if (OpMask.size() != (unsigned)NumElts ||
44357 llvm::any_of(OpInputs, [VT](SDValue V) {
44358 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44359 !V.getValueType().isVector();
44360 }))
44361 return false;
44362
44363 KnownZero = OpZero;
44364 KnownUndef = OpUndef;
44365
44366 // Check if shuffle mask can be simplified to undef/zero/identity.
44367 int NumSrcs = OpInputs.size();
44368 for (int i = 0; i != NumElts; ++i)
44369 if (!DemandedElts[i])
44370 OpMask[i] = SM_SentinelUndef;
44371
44372 if (isUndefInRange(OpMask, 0, NumElts)) {
44373 KnownUndef.setAllBits();
44374 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44375 }
44376 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44377 KnownZero.setAllBits();
44378 return TLO.CombineTo(
44379 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44380 }
44381 for (int Src = 0; Src != NumSrcs; ++Src)
44382 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44383 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44384
44385 // Attempt to simplify inputs.
44386 for (int Src = 0; Src != NumSrcs; ++Src) {
44387 // TODO: Support inputs of different types.
44388 if (OpInputs[Src].getValueType() != VT)
44389 continue;
44390
44391 int Lo = Src * NumElts;
44392 APInt SrcElts = APInt::getZero(NumElts);
44393 for (int i = 0; i != NumElts; ++i)
44394 if (DemandedElts[i]) {
44395 int M = OpMask[i] - Lo;
44396 if (0 <= M && M < NumElts)
44397 SrcElts.setBit(M);
44398 }
44399
44400 // TODO - Propagate input undef/zero elts.
44401 APInt SrcUndef, SrcZero;
44402 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44403 TLO, Depth + 1))
44404 return true;
44405 }
44406
44407 // If we don't demand all elements, then attempt to combine to a simpler
44408 // shuffle.
44409 // We need to convert the depth to something combineX86ShufflesRecursively
44410 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44411 // to match. This prevents combineX86ShuffleChain from returning a
44412 // combined shuffle that's the same as the original root, causing an
44413 // infinite loop.
44414 if (!DemandedElts.isAllOnes()) {
44415 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44416
44417 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44418 for (int i = 0; i != NumElts; ++i)
44419 if (DemandedElts[i])
44420 DemandedMask[i] = i;
44421
44423 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44425 /*AllowVariableCrossLaneMask=*/true,
44426 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44427 TLO.DAG, SDLoc(Op), Subtarget);
44428 if (NewShuffle)
44429 return TLO.CombineTo(Op, NewShuffle);
44430 }
44431
44432 return false;
44433}
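// Illustrative sketch of the generic tail above (an assumed input): when
// only the low lane of a wide target shuffle is demanded, the undemanded
// mask entries are forced to undef, so a shuffle whose remaining entries
// form an identity over one input collapses to a bitcast of that input;
// whatever is left is re-fed to combineX86ShufflesRecursively with a
// correspondingly reduced maximum depth.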
44434
44435bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44436 SDValue Op, const APInt &OriginalDemandedBits,
44437 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44438 unsigned Depth) const {
44439 EVT VT = Op.getValueType();
44440 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44441 unsigned Opc = Op.getOpcode();
44442 switch(Opc) {
44443 case X86ISD::VTRUNC: {
44444 KnownBits KnownOp;
44445 SDValue Src = Op.getOperand(0);
44446 MVT SrcVT = Src.getSimpleValueType();
44447
44448 // Simplify the input, using demanded bit information.
44449 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44450 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44451 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44452 return true;
44453 break;
44454 }
44455 case X86ISD::PMULDQ:
44456 case X86ISD::PMULUDQ: {
44457 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
44458 KnownBits KnownLHS, KnownRHS;
44459 SDValue LHS = Op.getOperand(0);
44460 SDValue RHS = Op.getOperand(1);
44461
44462 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44463 // FIXME: Can we bound this better?
44464 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44465 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44466 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44467
44468 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44469 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44470 DemandedMaskLHS = DemandedMask;
44471 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44472 DemandedMaskRHS = DemandedMask;
44473
44474 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44475 KnownLHS, TLO, Depth + 1))
44476 return true;
44477 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44478 KnownRHS, TLO, Depth + 1))
44479 return true;
44480
44481 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44482 KnownRHS = KnownRHS.trunc(32);
44483 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44484 KnownRHS.getConstant().isOne()) {
44485 SDLoc DL(Op);
44486 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44487 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44488 }
44489
44490 // Aggressively peek through ops to get at the demanded low bits.
44491 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44492 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44493 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44494 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44495 if (DemandedLHS || DemandedRHS) {
44496 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44497 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44498 return TLO.CombineTo(
44499 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44500 }
44501 break;
44502 }
44503 case X86ISD::ANDNP: {
44504 KnownBits Known2;
44505 SDValue Op0 = Op.getOperand(0);
44506 SDValue Op1 = Op.getOperand(1);
44507
44508 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44509 Known, TLO, Depth + 1))
44510 return true;
44511
44512 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44513 OriginalDemandedElts, Known2, TLO, Depth + 1))
44514 return true;
44515
44516 // If the RHS is a constant, see if we can simplify it.
44517 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44518 OriginalDemandedElts, TLO))
44519 return true;
44520
44521 // ANDNP = (~Op0 & Op1);
44522 Known.One &= Known2.Zero;
44523 Known.Zero |= Known2.One;
44524 break;
44525 }
44526 case X86ISD::VSHLI: {
44527 SDValue Op0 = Op.getOperand(0);
44528 SDValue Op1 = Op.getOperand(1);
44529
44530 unsigned ShAmt = Op1->getAsZExtVal();
44531 if (ShAmt >= BitWidth)
44532 break;
44533
44534 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44535
44536 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44537 // single shift. We can do this if the bottom bits (which are shifted
44538 // out) are never demanded.
44539 if (Op0.getOpcode() == X86ISD::VSRLI &&
44540 OriginalDemandedBits.countr_zero() >= ShAmt) {
44541 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44542 if (Shift2Amt < BitWidth) {
44543 int Diff = ShAmt - Shift2Amt;
44544 if (Diff == 0)
44545 return TLO.CombineTo(Op, Op0.getOperand(0));
44546
44547 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44548 SDValue NewShift = TLO.DAG.getNode(
44549 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44550 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44551 return TLO.CombineTo(Op, NewShift);
44552 }
44553 }
44554
44555 // If we are only demanding sign bits then we can use the shift source directly.
44556 unsigned NumSignBits =
44557 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44558 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44559 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44560 return TLO.CombineTo(Op, Op0);
44561
44562 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44563 TLO, Depth + 1))
44564 return true;
44565
44566 Known <<= ShAmt;
44567
44568 // Low bits known zero.
44569 Known.Zero.setLowBits(ShAmt);
44570
44571 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44572 // Attempt to avoid multi-use ops if we don't need anything from them.
44573 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44574 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44575 SDValue NewOp =
44576 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44577 return TLO.CombineTo(Op, NewOp);
44578 }
44579 }
44580 return false;
44581 }
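// Worked example for the shift-merge fold above (values chosen for
// exposition): VSHLI(VSRLI(X, 2), 6), when the low 6 bits of the result are
// not demanded, becomes VSHLI(X, 4); the two forms differ only in bits 4
// and 5, which are never demanded.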
44582 case X86ISD::VSRLI: {
44583 SDValue Op0 = Op.getOperand(0);
44584 SDValue Op1 = Op.getOperand(1);
44585
44586 unsigned ShAmt = Op1->getAsZExtVal();
44587 if (ShAmt >= BitWidth)
44588 break;
44589
44590 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44591
44592 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44593 TLO, Depth + 1))
44594 return true;
44595
44596 Known >>= ShAmt;
44597
44598 // High bits known zero.
44599 Known.Zero.setHighBits(ShAmt);
44600
44601 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44602 // Attempt to avoid multi-use ops if we don't need anything from them.
44603 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44604 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44605 SDValue NewOp =
44606 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44607 return TLO.CombineTo(Op, NewOp);
44608 }
44609 }
44610 return false;
44611 }
44612 case X86ISD::VSRAI: {
44613 SDValue Op0 = Op.getOperand(0);
44614 SDValue Op1 = Op.getOperand(1);
44615
44616 unsigned ShAmt = Op1->getAsZExtVal();
44617 if (ShAmt >= BitWidth)
44618 break;
44619
44620 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44621
44622 // If we just want the sign bit then we don't need to shift it.
44623 if (OriginalDemandedBits.isSignMask())
44624 return TLO.CombineTo(Op, Op0);
44625
44626 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44627 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44628 SDValue Op00 = Op0.getOperand(0);
44629 unsigned NumSignBits =
44630 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44631 if (ShAmt < NumSignBits)
44632 return TLO.CombineTo(Op, Op00);
44633 }
44634
44635 // If any of the demanded bits are produced by the sign extension, we also
44636 // demand the input sign bit.
44637 if (OriginalDemandedBits.countl_zero() < ShAmt)
44638 DemandedMask.setSignBit();
44639
44640 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44641 TLO, Depth + 1))
44642 return true;
44643
44644 Known >>= ShAmt;
44645
44646 // If the input sign bit is known to be zero, or if none of the top bits
44647 // are demanded, turn this into an unsigned shift right.
44648 if (Known.Zero[BitWidth - ShAmt - 1] ||
44649 OriginalDemandedBits.countl_zero() >= ShAmt)
44650 return TLO.CombineTo(
44651 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44652
44653 // High bits are known one.
44654 if (Known.One[BitWidth - ShAmt - 1])
44655 Known.One.setHighBits(ShAmt);
44656
44657 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44658 // Attempt to avoid multi-use ops if we don't need anything from them.
44659 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44660 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44661 SDValue NewOp =
44662 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44663 return TLO.CombineTo(Op, NewOp);
44664 }
44665 }
44666 return false;
44667 }
44668 case X86ISD::BLENDI: {
44669 SDValue LHS = Op.getOperand(0);
44670 SDValue RHS = Op.getOperand(1);
44671 APInt Mask = getBLENDIBlendMask(Op);
44672
44673 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44674 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44675 TLO, Depth + 1))
44676 return true;
44677
44678 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44679 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44680 TLO, Depth + 1))
44681 return true;
44682
44683 // Attempt to avoid multi-use ops if we don't need anything from them.
44684 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44685 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44686 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44687 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44688 if (NewLHS || NewRHS) {
44689 NewLHS = NewLHS ? NewLHS : LHS;
44690 NewRHS = NewRHS ? NewRHS : RHS;
44691 return TLO.CombineTo(Op,
44692 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44693 NewLHS, NewRHS, Op.getOperand(2)));
44694 }
44695 break;
44696 }
44697 case X86ISD::BLENDV: {
44698 SDValue Sel = Op.getOperand(0);
44699 SDValue LHS = Op.getOperand(1);
44700 SDValue RHS = Op.getOperand(2);
44701
44702 APInt SignMask = APInt::getSignMask(BitWidth);
44703 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44704 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44705 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44706 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44707 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44708 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44709
44710 if (NewSel || NewLHS || NewRHS) {
44711 NewSel = NewSel ? NewSel : Sel;
44712 NewLHS = NewLHS ? NewLHS : LHS;
44713 NewRHS = NewRHS ? NewRHS : RHS;
44714 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44715 NewSel, NewLHS, NewRHS));
44716 }
44717 break;
44718 }
44719 case X86ISD::PEXTRB:
44720 case X86ISD::PEXTRW: {
44721 SDValue Vec = Op.getOperand(0);
44722 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44723 MVT VecVT = Vec.getSimpleValueType();
44724 unsigned NumVecElts = VecVT.getVectorNumElements();
44725
44726 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44727 unsigned Idx = CIdx->getZExtValue();
44728 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44729
44730 // If we demand no bits from the vector then we must have demanded
44731 // bits from the implicit zext - simplify to zero.
44732 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44733 if (DemandedVecBits == 0)
44734 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44735
44736 APInt KnownUndef, KnownZero;
44737 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44738 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44739 KnownZero, TLO, Depth + 1))
44740 return true;
44741
44742 KnownBits KnownVec;
44743 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44744 KnownVec, TLO, Depth + 1))
44745 return true;
44746
44747 if (SDValue V = SimplifyMultipleUseDemandedBits(
44748 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44749 return TLO.CombineTo(
44750 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44751
44752 Known = KnownVec.zext(BitWidth);
44753 return false;
44754 }
44755 break;
44756 }
44757 case X86ISD::PINSRB:
44758 case X86ISD::PINSRW: {
44759 SDValue Vec = Op.getOperand(0);
44760 SDValue Scl = Op.getOperand(1);
44761 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44762 MVT VecVT = Vec.getSimpleValueType();
44763
44764 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44765 unsigned Idx = CIdx->getZExtValue();
44766 if (!OriginalDemandedElts[Idx])
44767 return TLO.CombineTo(Op, Vec);
44768
44769 KnownBits KnownVec;
44770 APInt DemandedVecElts(OriginalDemandedElts);
44771 DemandedVecElts.clearBit(Idx);
44772 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44773 KnownVec, TLO, Depth + 1))
44774 return true;
44775
44776 KnownBits KnownScl;
44777 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44778 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44779 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44780 return true;
44781
44782 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44783 Known = KnownVec.intersectWith(KnownScl);
44784 return false;
44785 }
44786 break;
44787 }
44788 case X86ISD::PACKSS:
44789 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44790 // sign bit then we can just ask for the source operands sign bit.
44791 // TODO - add known bits handling.
44792 if (OriginalDemandedBits.isSignMask()) {
44793 APInt DemandedLHS, DemandedRHS;
44794 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44795
44796 KnownBits KnownLHS, KnownRHS;
44797 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44798 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44799 KnownLHS, TLO, Depth + 1))
44800 return true;
44801 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44802 KnownRHS, TLO, Depth + 1))
44803 return true;
44804
44805 // Attempt to avoid multi-use ops if we don't need anything from them.
44806 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44807 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44808 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44809 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44810 if (DemandedOp0 || DemandedOp1) {
44811 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44812 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44813 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44814 }
44815 }
44816 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44817 break;
44818 case X86ISD::VBROADCAST: {
44819 SDValue Src = Op.getOperand(0);
44820 MVT SrcVT = Src.getSimpleValueType();
44821 APInt DemandedElts = APInt::getOneBitSet(
44822 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44823 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44824 TLO, Depth + 1))
44825 return true;
44826 // If we don't need the upper bits, attempt to narrow the broadcast source.
44827 // Don't attempt this on AVX512 as it might affect broadcast folding.
44828 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44829 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44830 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44831 Src->hasOneUse()) {
44832 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44833 SDValue NewSrc =
44834 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44835 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44836 SDValue NewBcst =
44837 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44838 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44839 }
44840 break;
44841 }
44842 case X86ISD::PCMPGT:
44843 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44844 // iff we only need the sign bit then we can use R directly.
44845 if (OriginalDemandedBits.isSignMask() &&
44846 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44847 return TLO.CombineTo(Op, Op.getOperand(1));
44848 break;
44849 case X86ISD::MOVMSK: {
44850 SDValue Src = Op.getOperand(0);
44851 MVT SrcVT = Src.getSimpleValueType();
44852 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44853 unsigned NumElts = SrcVT.getVectorNumElements();
44854
44855 // If we don't need the sign bits at all just return zero.
44856 if (OriginalDemandedBits.countr_zero() >= NumElts)
44857 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44858
44859 // See if we only demand bits from the lower 128-bit vector.
44860 if (SrcVT.is256BitVector() &&
44861 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44862 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44863 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44864 }
44865
44866 // Only demand the vector elements of the sign bits we need.
44867 APInt KnownUndef, KnownZero;
44868 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44869 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44870 TLO, Depth + 1))
44871 return true;
44872
44873 Known.Zero = KnownZero.zext(BitWidth);
44874 Known.Zero.setHighBits(BitWidth - NumElts);
44875
44876 // MOVMSK only uses the MSB from each vector element.
44877 KnownBits KnownSrc;
44878 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44879 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44880 Depth + 1))
44881 return true;
44882
44883 if (KnownSrc.One[SrcBits - 1])
44884 Known.One.setLowBits(NumElts);
44885 else if (KnownSrc.Zero[SrcBits - 1])
44886 Known.Zero.setLowBits(NumElts);
44887
44888 // Attempt to avoid multi-use ops if we don't need anything from it.
44889 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44890 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44891 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44892 return false;
44893 }
44894 case X86ISD::TESTP: {
44895 SDValue Op0 = Op.getOperand(0);
44896 SDValue Op1 = Op.getOperand(1);
44897 MVT OpVT = Op0.getSimpleValueType();
44898 assert((OpVT.getVectorElementType() == MVT::f32 ||
44899 OpVT.getVectorElementType() == MVT::f64) &&
44900 "Illegal vector type for X86ISD::TESTP");
44901
44902 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44903 KnownBits KnownSrc;
44904 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44905 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44906 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44907 AssumeSingleUse) ||
44908 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44909 AssumeSingleUse);
44910 }
44911 case X86ISD::CMOV: {
44912 KnownBits Known2;
44913 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44914 OriginalDemandedElts, Known2, TLO, Depth + 1))
44915 return true;
44916 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44917 OriginalDemandedElts, Known, TLO, Depth + 1))
44918 return true;
44919
44920 // Only known if known in both the LHS and RHS.
44921 Known = Known.intersectWith(Known2);
44922 return false;
44923 }
44924 case X86ISD::BEXTR:
44925 case X86ISD::BEXTRI: {
44926 SDValue Op0 = Op.getOperand(0);
44927 SDValue Op1 = Op.getOperand(1);
44928
44929 // Only bottom 16-bits of the control bits are required.
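// For reference (illustrative encoding): bits[7:0] of the control hold the
// start index and bits[15:8] hold the length, so a control of 0x0804
// extracts 8 bits starting at bit 4.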
44930 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44931 // NOTE: SimplifyDemandedBits won't do this for constants.
44932 uint64_t Val1 = Cst1->getZExtValue();
44933 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44934 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44935 SDLoc DL(Op);
44936 return TLO.CombineTo(
44937 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44938 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44939 }
44940
44941 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44942 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44943
44944 // If the length is 0, the result is 0.
44945 if (Length == 0) {
44946 Known.setAllZero();
44947 return false;
44948 }
44949
44950 if ((Shift + Length) <= BitWidth) {
44951 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44952 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44953 return true;
44954
44955 Known = Known.extractBits(Length, Shift);
44956 Known = Known.zextOrTrunc(BitWidth);
44957 return false;
44958 }
44959 } else {
44960 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44961 KnownBits Known1;
44962 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44963 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44964 return true;
44965
44966 // If the length is 0, replace with 0.
44967 KnownBits LengthBits = Known1.extractBits(8, 8);
44968 if (LengthBits.isZero())
44969 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44970 }
44971
44972 break;
44973 }
44974 case X86ISD::PDEP: {
44975 SDValue Op0 = Op.getOperand(0);
44976 SDValue Op1 = Op.getOperand(1);
44977
44978 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44979 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44980
44981 // If the demanded bits have leading zeroes, we don't demand those from the
44982 // mask.
44983 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44984 return true;
44985
44986 // The number of possible 1s in the mask determines the number of LSBs of
44987 // operand 0 used. Undemanded bits from the mask don't matter so filter
44988 // them before counting.
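// Illustrative example: depositing src = 0b101 into mask = 0b10110 yields
// 0b10010, so only popcount(mask) = 3 low bits of the source are ever read.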
44989 KnownBits Known2;
44990 uint64_t Count = (~Known.Zero & LoMask).popcount();
44991 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44992 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44993 return true;
44994
44995 // Zeroes are retained from the mask, but not ones.
44996 Known.One.clearAllBits();
44997 // The result will have at least as many trailing zeros as the non-mask
44998 // operand since bits can only map to the same or higher bit position.
44999 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
45000 return false;
45001 }
45002 case X86ISD::VPMADD52L:
45003 case X86ISD::VPMADD52H: {
45004 KnownBits KnownOp0, KnownOp1, KnownOp2;
45005 SDValue Op0 = Op.getOperand(0);
45006 SDValue Op1 = Op.getOperand(1);
45007 SDValue Op2 = Op.getOperand(2);
45008 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
45009 // operand 2).
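// Illustrative per-lane example: with Op0 = 3 and Op1 = 5 the 104-bit
// product is 15, so VPMADD52L adds 15 to the Op2 lane while VPMADD52H adds
// the (zero) upper 52 bits of that product.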
45010 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
45011 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
45012 TLO, Depth + 1))
45013 return true;
45014
45015 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
45016 TLO, Depth + 1))
45017 return true;
45018
45019 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
45020 KnownOp2, TLO, Depth + 1))
45021 return true;
45022
45023 KnownBits KnownMul;
45024 KnownOp0 = KnownOp0.trunc(52);
45025 KnownOp1 = KnownOp1.trunc(52);
45026 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45027 : KnownBits::mulhu(KnownOp0, KnownOp1);
45028 KnownMul = KnownMul.zext(64);
45029
45030 // lo/hi(X * Y) + Z --> C + Z
45031 if (KnownMul.isConstant()) {
45032 SDLoc DL(Op);
45033 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45034 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45035 }
45036
45037 Known = KnownBits::add(KnownMul, KnownOp2);
45038 return false;
45039 }
45040 }
45041
45042 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45043 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45044}
45045
45046 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45047 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45048 SelectionDAG &DAG, unsigned Depth) const {
45049 int NumElts = DemandedElts.getBitWidth();
45050 unsigned Opc = Op.getOpcode();
45051 EVT VT = Op.getValueType();
45052
45053 switch (Opc) {
45054 case X86ISD::PINSRB:
45055 case X86ISD::PINSRW: {
45056 // If we don't demand the inserted element, return the base vector.
45057 SDValue Vec = Op.getOperand(0);
45058 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45059 MVT VecVT = Vec.getSimpleValueType();
45060 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45061 !DemandedElts[CIdx->getZExtValue()])
45062 return Vec;
45063 break;
45064 }
45065 case X86ISD::VSHLI: {
45066 // If we are only demanding sign bits then we can use the shift source
45067 // directly.
45068 SDValue Op0 = Op.getOperand(0);
45069 unsigned ShAmt = Op.getConstantOperandVal(1);
45070 unsigned BitWidth = DemandedBits.getBitWidth();
45071 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45072 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45073 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45074 return Op0;
45075 break;
45076 }
45077 case X86ISD::VSRAI:
45078 // iff we only need the sign bit then we can use the source directly.
45079 // TODO: generalize where we only demand extended signbits.
45080 if (DemandedBits.isSignMask())
45081 return Op.getOperand(0);
45082 break;
45083 case X86ISD::PCMPGT:
45084 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45085 // iff we only need the sign bit then we can use R directly.
45086 if (DemandedBits.isSignMask() &&
45087 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45088 return Op.getOperand(1);
45089 break;
45090 case X86ISD::BLENDV: {
45091 // BLENDV: Cond (MSB) ? LHS : RHS
45092 SDValue Cond = Op.getOperand(0);
45093 SDValue LHS = Op.getOperand(1);
45094 SDValue RHS = Op.getOperand(2);
45095
45096 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45097 if (CondKnown.isNegative())
45098 return LHS;
45099 if (CondKnown.isNonNegative())
45100 return RHS;
45101 break;
45102 }
45103 case X86ISD::ANDNP: {
45104 // ANDNP = (~LHS & RHS);
45105 SDValue LHS = Op.getOperand(0);
45106 SDValue RHS = Op.getOperand(1);
45107
45108 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45109 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45110
45111 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45112 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45113 // this context, so return RHS.
45114 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45115 return RHS;
45116 break;
45117 }
45118 }
45119
45120 APInt ShuffleUndef, ShuffleZero;
45121 SmallVector<int, 16> ShuffleMask;
45122 SmallVector<SDValue, 2> ShuffleOps;
45123 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45124 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45125 // If all the demanded elts are from one operand and are inline,
45126 // then we can use the operand directly.
45127 int NumOps = ShuffleOps.size();
45128 if (ShuffleMask.size() == (unsigned)NumElts &&
45129 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45130 return VT.getSizeInBits() == V.getValueSizeInBits();
45131 })) {
45132
45133 if (DemandedElts.isSubsetOf(ShuffleUndef))
45134 return DAG.getUNDEF(VT);
45135 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45136 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45137
45138 // Bitmask that indicates which ops have only been accessed 'inline'.
45139 APInt IdentityOp = APInt::getAllOnes(NumOps);
45140 for (int i = 0; i != NumElts; ++i) {
45141 int M = ShuffleMask[i];
45142 if (!DemandedElts[i] || ShuffleUndef[i])
45143 continue;
45144 int OpIdx = M / NumElts;
45145 int EltIdx = M % NumElts;
45146 if (M < 0 || EltIdx != i) {
45147 IdentityOp.clearAllBits();
45148 break;
45149 }
45150 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45151 if (IdentityOp == 0)
45152 break;
45153 }
45154 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45155 "Multiple identity shuffles detected");
45156
45157 if (IdentityOp != 0)
45158 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45159 }
45160 }
45161
45162 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45163 Op, DemandedBits, DemandedElts, DAG, Depth);
45164}
45165
45166 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45167 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45168 bool PoisonOnly, unsigned Depth) const {
45169 unsigned NumElts = DemandedElts.getBitWidth();
45170
45171 switch (Op.getOpcode()) {
45172 case X86ISD::GlobalBaseReg:
45173 case X86ISD::Wrapper:
45174 case X86ISD::WrapperRIP:
45175 return true;
45176 case X86ISD::BLENDI:
45177 case X86ISD::PSHUFD:
45178 case X86ISD::UNPCKL:
45179 case X86ISD::UNPCKH:
45180 case X86ISD::VPERMILPI:
45181 case X86ISD::VPERMV3: {
45182 SmallVector<int, 8> Mask;
45183 SmallVector<SDValue, 2> Ops;
45184 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45185 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45186 APInt::getZero(NumElts));
45187 for (auto M : enumerate(Mask)) {
45188 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45189 continue;
45190 if (M.value() == SM_SentinelUndef)
45191 return false;
45192 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45193 "Shuffle mask index out of range");
45194 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45195 }
45196 for (auto Op : enumerate(Ops))
45197 if (!DemandedSrcElts[Op.index()].isZero() &&
45198 !DAG.isGuaranteedNotToBeUndefOrPoison(
45199 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45200 return false;
45201 return true;
45202 }
45203 break;
45204 }
45205 }
45206 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45207 Op, DemandedElts, DAG, PoisonOnly, Depth);
45208}
45209
45210 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45211 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45212 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45213
45214 switch (Op.getOpcode()) {
45215 // SSE bit logic.
45216 case X86ISD::FAND:
45217 case X86ISD::FOR:
45218 case X86ISD::FXOR:
45219 case X86ISD::FANDN:
45220 case X86ISD::ANDNP:
45221 case X86ISD::VPTERNLOG:
45222 return false;
45223 // SSE vector insert/extracts use modulo indices.
45224 case X86ISD::PINSRB:
45225 case X86ISD::PINSRW:
45226 case X86ISD::PEXTRB:
45227 case X86ISD::PEXTRW:
45228 return false;
45229 // SSE vector multiplies are either inbounds or saturate.
45230 case X86ISD::VPMADDUBSW:
45231 case X86ISD::VPMADDWD:
45232 return false;
45233 // SSE vector shifts handle out of bounds shift amounts.
45234 case X86ISD::VSHLI:
45235 case X86ISD::VSRLI:
45236 case X86ISD::VSRAI:
45237 return false;
45238 // SSE blends.
45239 case X86ISD::BLENDI:
45240 case X86ISD::BLENDV:
45241 return false;
45242 // SSE target shuffles.
45243 case X86ISD::PSHUFD:
45244 case X86ISD::UNPCKL:
45245 case X86ISD::UNPCKH:
45246 case X86ISD::VPERMILPI:
45247 case X86ISD::VPERMV3:
45248 return false;
45249 // SSE comparisons handle all icmp/fcmp cases.
45250 // TODO: Add CMPM/MM with test coverage.
45251 case X86ISD::CMPP:
45252 case X86ISD::PCMPEQ:
45253 case X86ISD::PCMPGT:
45254 return false;
45255 // SSE signbit extraction.
45256 case X86ISD::MOVMSK:
45257 return false;
45258 // GFNI instructions.
45259 case X86ISD::GF2P8AFFINEINVQB:
45260 case X86ISD::GF2P8AFFINEQB:
45261 case X86ISD::GF2P8MULB:
45262 return false;
45263 case ISD::INTRINSIC_WO_CHAIN:
45264 switch (Op->getConstantOperandVal(0)) {
45265 case Intrinsic::x86_sse2_pmadd_wd:
45266 case Intrinsic::x86_avx2_pmadd_wd:
45267 case Intrinsic::x86_avx512_pmaddw_d_512:
45268 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45269 case Intrinsic::x86_avx2_pmadd_ub_sw:
45270 case Intrinsic::x86_avx512_pmaddubs_w_512:
45271 return false;
45272 case Intrinsic::x86_avx512_vpermi2var_d_128:
45273 case Intrinsic::x86_avx512_vpermi2var_d_256:
45274 case Intrinsic::x86_avx512_vpermi2var_d_512:
45275 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45276 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45277 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45278 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45279 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45280 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45281 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45282 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45283 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45284 case Intrinsic::x86_avx512_vpermi2var_q_128:
45285 case Intrinsic::x86_avx512_vpermi2var_q_256:
45286 case Intrinsic::x86_avx512_vpermi2var_q_512:
45287 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45288 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45289 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45290 return false;
45291 }
45292 }
45293 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45294 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45295}
45296
45297 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45298 const APInt &DemandedElts,
45299 APInt &UndefElts,
45300 const SelectionDAG &DAG,
45301 unsigned Depth) const {
45302 unsigned NumElts = DemandedElts.getBitWidth();
45303 unsigned Opc = Op.getOpcode();
45304
45305 switch (Opc) {
45306 case X86ISD::VBROADCAST:
45307 case X86ISD::VBROADCAST_LOAD:
45308 UndefElts = APInt::getZero(NumElts);
45309 return true;
45310 }
45311
45312 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45313 DAG, Depth);
45314}
45315
45316// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45317// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45318static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45319 bool AllowTruncate, unsigned Depth) {
45320 // Limit recursion.
45321 if (Depth >= SelectionDAG::MaxRecursionDepth)
45322 return false;
45323 switch (Src.getOpcode()) {
45324 case ISD::TRUNCATE:
45325 if (!AllowTruncate)
45326 return false;
45327 [[fallthrough]];
45328 case ISD::SETCC:
45329 return Src.getOperand(0).getValueSizeInBits() == Size;
45330 case ISD::FREEZE:
45331 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45332 Depth + 1);
45333 case ISD::AND:
45334 case ISD::XOR:
45335 case ISD::OR:
45336 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45337 Depth + 1) &&
45338 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45339 Depth + 1);
45340 case ISD::SELECT:
45341 case ISD::VSELECT:
45342 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45343 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45344 Depth + 1) &&
45345 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45346 Depth + 1);
45347 case ISD::BUILD_VECTOR:
45348 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45349 ISD::isBuildVectorAllOnes(Src.getNode());
45350 }
45351 return false;
45352}
45353
45354// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45355static unsigned getAltBitOpcode(unsigned Opcode) {
45356 switch(Opcode) {
45357 // clang-format off
45358 case ISD::AND: return X86ISD::FAND;
45359 case ISD::OR: return X86ISD::FOR;
45360 case ISD::XOR: return X86ISD::FXOR;
45361 case X86ISD::ANDNP: return X86ISD::FANDN;
45362 // clang-format on
45363 }
45364 llvm_unreachable("Unknown bitwise opcode");
45365}
45366
45367// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45368 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45369 const SDLoc &DL) {
45370 EVT SrcVT = Src.getValueType();
45371 if (SrcVT != MVT::v4i1)
45372 return SDValue();
45373
45374 switch (Src.getOpcode()) {
45375 case ISD::SETCC:
45376 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45377 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45378 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45379 SDValue Op0 = Src.getOperand(0);
45380 if (ISD::isNormalLoad(Op0.getNode()))
45381 return DAG.getBitcast(MVT::v4f32, Op0);
45382 if (Op0.getOpcode() == ISD::BITCAST &&
45383 Op0.getOperand(0).getValueType() == MVT::v4f32)
45384 return Op0.getOperand(0);
45385 }
45386 break;
45387 case ISD::AND:
45388 case ISD::XOR:
45389 case ISD::OR: {
45390 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45391 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45392 if (Op0 && Op1)
45393 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45394 Op1);
45395 break;
45396 }
45397 }
45398 return SDValue();
45399}
45400
45401// Helper to push sign extension of vXi1 SETCC result through bitops.
45402 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45403 SDValue Src, const SDLoc &DL) {
45404 switch (Src.getOpcode()) {
45405 case ISD::SETCC:
45406 case ISD::FREEZE:
45407 case ISD::TRUNCATE:
45408 case ISD::BUILD_VECTOR:
45409 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45410 case ISD::AND:
45411 case ISD::XOR:
45412 case ISD::OR:
45413 return DAG.getNode(
45414 Src.getOpcode(), DL, SExtVT,
45415 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45416 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45417 case ISD::SELECT:
45418 case ISD::VSELECT:
45419 return DAG.getSelect(
45420 DL, SExtVT, Src.getOperand(0),
45421 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45422 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45423 }
45424 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45425}
45426
45427// Try to match patterns such as
45428// (i16 bitcast (v16i1 x))
45429// ->
45430// (i16 movmsk (v16i8 sext (v16i1 x)))
45431// before the illegal vector is scalarized on subtargets that don't have legal
45432// vxi1 types.
45433 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45434 const SDLoc &DL,
45435 const X86Subtarget &Subtarget) {
45436 EVT SrcVT = Src.getValueType();
45437 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45438 return SDValue();
45439
45440 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45441 // legalization destroys the v4i32 type.
45442 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45443 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45444 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45445 DAG.getBitcast(MVT::v4f32, V));
45446 return DAG.getZExtOrTrunc(V, DL, VT);
45447 }
45448 }
45449
45450 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45451 // movmskb even with avx512. This will be better than truncating to vXi1 and
45452 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45453 // vpcmpeqb/vpcmpgtb.
45454 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45455 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45456 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45457 Src.getOperand(0).getValueType() == MVT::v64i8);
45458
45459 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45460 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45461 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45462 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45463 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45464 EVT CmpVT = Src.getOperand(0).getValueType();
45465 EVT EltVT = CmpVT.getVectorElementType();
45466 if (CmpVT.getSizeInBits() <= 256 &&
45467 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45468 PreferMovMsk = true;
45469 }
45470
45471 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45472 // MOVMSK is supported in SSE2 or later.
45473 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45474 return SDValue();
45475
45476 // If the upper ops of a concatenation are undef, then try to bitcast the
45477 // lower op and extend.
45478 SmallVector<SDValue, 4> SubSrcOps;
45479 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45480 SubSrcOps.size() >= 2) {
45481 SDValue LowerOp = SubSrcOps[0];
45482 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45483 if (LowerOp.getOpcode() == ISD::SETCC &&
45484 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45485 EVT SubVT = VT.getIntegerVT(
45486 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45487 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45488 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45489 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45490 }
45491 }
45492 }
45493
45494 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45495 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45496 // v8i16 and v16i16.
45497 // For these two cases, we can shuffle the upper element bytes to a
45498 // consecutive sequence at the start of the vector and treat the results as
45499 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45500 // for v16i16 this is not the case, because the shuffle is expensive, so we
45501 // avoid sign-extending to this type entirely.
45502 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45503 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45504 MVT SExtVT;
45505 bool PropagateSExt = false;
45506 switch (SrcVT.getSimpleVT().SimpleTy) {
45507 default:
45508 return SDValue();
45509 case MVT::v2i1:
45510 SExtVT = MVT::v2i64;
45511 break;
45512 case MVT::v4i1:
45513 SExtVT = MVT::v4i32;
45514 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45515 // sign-extend to a 256-bit operation to avoid truncation.
45516 if (Subtarget.hasAVX() &&
45517 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45518 SExtVT = MVT::v4i64;
45519 PropagateSExt = true;
45520 }
45521 break;
45522 case MVT::v8i1:
45523 SExtVT = MVT::v8i16;
45524 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45525 // sign-extend to a 256-bit operation to match the compare.
45526 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45527 // 256-bit because the shuffle is cheaper than sign extending the result of
45528 // the compare.
45529 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45530 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45531 SExtVT = MVT::v8i32;
45532 PropagateSExt = true;
45533 }
45534 break;
45535 case MVT::v16i1:
45536 SExtVT = MVT::v16i8;
45537 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45538 // it is not profitable to sign-extend to 256-bit because this will
45539 // require an extra cross-lane shuffle which is more expensive than
45540 // truncating the result of the compare to 128-bits.
45541 break;
45542 case MVT::v32i1:
45543 SExtVT = MVT::v32i8;
45544 break;
45545 case MVT::v64i1:
45546 // If we have AVX512F but not AVX512BW, and the input is truncated from
45547 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45548 if (Subtarget.hasAVX512()) {
45549 if (Subtarget.hasBWI())
45550 return SDValue();
45551 SExtVT = MVT::v64i8;
45552 break;
45553 }
45554 // Split if this is a <64 x i8> comparison result.
45555 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45556 SExtVT = MVT::v64i8;
45557 break;
45558 }
45559 return SDValue();
45560 };
45561
45562 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45563 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45564
45565 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45566 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45567 } else {
45568 if (SExtVT == MVT::v8i16) {
45569 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45570 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45571 }
45572 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45573 }
45574
45575 EVT IntVT =
45576 EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45577 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45578 return DAG.getBitcast(VT, V);
45579}
45580
45581// Convert a vXi1 constant build vector to the same width scalar integer.
45582 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45583 EVT SrcVT = Op.getValueType();
45584 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45585 "Expected a vXi1 vector");
45587 "Expected a constant build vector");
45588
45589 APInt Imm(SrcVT.getVectorNumElements(), 0);
45590 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45591 SDValue In = Op.getOperand(Idx);
45592 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45593 Imm.setBit(Idx);
45594 }
45595 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45596 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45597}
45598
45599 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45600 TargetLowering::DAGCombinerInfo &DCI,
45601 const X86Subtarget &Subtarget) {
45602 using namespace SDPatternMatch;
45603 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45604
45605 if (!DCI.isBeforeLegalizeOps())
45606 return SDValue();
45607
45608 // Only do this if we have k-registers.
45609 if (!Subtarget.hasAVX512())
45610 return SDValue();
45611
45612 EVT DstVT = N->getValueType(0);
45613 SDValue Op = N->getOperand(0);
45614 EVT SrcVT = Op.getValueType();
45615
45616 // Make sure we have a bitcast between mask registers and a scalar type.
45617 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45618 DstVT.isScalarInteger()) &&
45619 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45620 SrcVT.isScalarInteger()))
45621 return SDValue();
45622
45623 SDValue LHS, RHS;
45624
45625 // Look for logic ops.
45627 return SDValue();
45628
45629 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45630 // least one of the getBitcast() will fold away).
45631 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45632 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45633 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45634 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45635
45636 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45637 // Most of these have to move a constant from the scalar domain anyway.
45638 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45639 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45640 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45641 DAG.getBitcast(DstVT, LHS), RHS);
45642 }
45643
45644 return SDValue();
45645}
45646
45647 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45648 const X86Subtarget &Subtarget) {
45649 SDLoc DL(BV);
45650 unsigned NumElts = BV->getNumOperands();
45651 SDValue Splat = BV->getSplatValue();
45652
45653 // Build MMX element from integer GPR or SSE float values.
45654 auto CreateMMXElement = [&](SDValue V) {
45655 if (V.isUndef())
45656 return DAG.getUNDEF(MVT::x86mmx);
45657 if (V.getValueType().isFloatingPoint()) {
45658 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45659 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45660 V = DAG.getBitcast(MVT::v2i64, V);
45661 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45662 }
45663 V = DAG.getBitcast(MVT::i32, V);
45664 } else {
45665 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45666 }
45667 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45668 };
45669
45670 // Convert build vector ops to MMX data in the bottom elements.
45671 SmallVector<SDValue, 8> Ops;
45672
45673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45674
45675 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45676 if (Splat) {
45677 if (Splat.isUndef())
45678 return DAG.getUNDEF(MVT::x86mmx);
45679
45680 Splat = CreateMMXElement(Splat);
45681
45682 if (Subtarget.hasSSE1()) {
45683 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45684 if (NumElts == 8)
45685 Splat = DAG.getNode(
45686 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45687 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45688 TLI.getPointerTy(DAG.getDataLayout())),
45689 Splat, Splat);
45690
45691 // Use PSHUFW to repeat 16-bit elements.
45692 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45693 return DAG.getNode(
45694 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45695 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45696 TLI.getPointerTy(DAG.getDataLayout())),
45697 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45698 }
45699 Ops.append(NumElts, Splat);
45700 } else {
45701 for (unsigned i = 0; i != NumElts; ++i)
45702 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45703 }
45704
45705 // Use tree of PUNPCKLs to build up general MMX vector.
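// For example (v8i8 case, illustrative): 8 single-element ops are combined
// pairwise with punpcklbw into 4, then punpcklwd into 2, then punpckldq
// into the final x86mmx value.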
45706 while (Ops.size() > 1) {
45707 unsigned NumOps = Ops.size();
45708 unsigned IntrinOp =
45709 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45710 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45711 : Intrinsic::x86_mmx_punpcklbw));
45712 SDValue Intrin = DAG.getTargetConstant(
45713 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45714 for (unsigned i = 0; i != NumOps; i += 2)
45715 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45716 Ops[i], Ops[i + 1]);
45717 Ops.resize(NumOps / 2);
45718 }
45719
45720 return Ops[0];
45721}
45722
45723// Recursive function that attempts to find if a bool vector node was originally
45724// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45725// integer. If so, replace the scalar ops with bool vector equivalents back down
45726// the chain.
45727 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45728 SelectionDAG &DAG,
45729 const X86Subtarget &Subtarget,
45730 unsigned Depth = 0) {
45731 if (Depth >= SelectionDAG::MaxRecursionDepth)
45732 return SDValue(); // Limit search depth.
45733
45734 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45735 unsigned Opc = V.getOpcode();
45736 switch (Opc) {
45737 case ISD::BITCAST: {
45738 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45739 SDValue Src = V.getOperand(0);
45740 EVT SrcVT = Src.getValueType();
45741 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45742 return DAG.getBitcast(VT, Src);
45743 break;
45744 }
45745 case ISD::Constant: {
45746 auto *C = cast<ConstantSDNode>(V);
45747 if (C->isZero())
45748 return DAG.getConstant(0, DL, VT);
45749 if (C->isAllOnes())
45750 return DAG.getAllOnesConstant(DL, VT);
45751 break;
45752 }
45753 case ISD::TRUNCATE: {
45754 // If we find a suitable source, a truncated scalar becomes a subvector.
45755 SDValue Src = V.getOperand(0);
45756 EVT NewSrcVT =
45757 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45758 if (TLI.isTypeLegal(NewSrcVT))
45759 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45760 Subtarget, Depth + 1))
45761 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45762 DAG.getVectorIdxConstant(0, DL));
45763 break;
45764 }
45765 case ISD::ANY_EXTEND:
45766 case ISD::ZERO_EXTEND: {
45767 // If we find a suitable source, an extended scalar becomes a subvector.
45768 SDValue Src = V.getOperand(0);
45769 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45770 Src.getScalarValueSizeInBits());
45771 if (TLI.isTypeLegal(NewSrcVT))
45772 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45773 Subtarget, Depth + 1))
45774 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45775 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45776 : DAG.getConstant(0, DL, VT),
45777 N0, DAG.getVectorIdxConstant(0, DL));
45778 break;
45779 }
45780 case ISD::OR:
45781 case ISD::XOR: {
45782 // If we find suitable sources, we can just move the op to the vector
45783 // domain.
45784 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45785 Subtarget, Depth + 1))
45786 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45787 Subtarget, Depth + 1))
45788 return DAG.getNode(Opc, DL, VT, N0, N1);
45789 break;
45790 }
45791 case ISD::SHL: {
45792 // If we find a suitable source, a SHL becomes a KSHIFTL.
45793 SDValue Src0 = V.getOperand(0);
45794 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45795 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45796 break;
45797
45798 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45799 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45800 Depth + 1))
45801 return DAG.getNode(
45802 X86ISD::KSHIFTL, DL, VT, N0,
45803 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45804 break;
45805 }
45806 }
45807
45808 // Does the inner bitcast already exist?
45809 if (Depth > 0)
45810 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45811 return SDValue(Alt, 0);
45812
45813 return SDValue();
45814}
45815
45816 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45817 TargetLowering::DAGCombinerInfo &DCI,
45818 const X86Subtarget &Subtarget) {
45819 SDValue N0 = N->getOperand(0);
45820 EVT VT = N->getValueType(0);
45821 EVT SrcVT = N0.getValueType();
45822 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45823
45824 // Try to match patterns such as
45825 // (i16 bitcast (v16i1 x))
45826 // ->
45827 // (i16 movmsk (v16i8 sext (v16i1 x)))
45828 // before the setcc result is scalarized on subtargets that don't have legal
45829 // vxi1 types.
45830 if (DCI.isBeforeLegalize()) {
45831 SDLoc dl(N);
45832 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45833 return V;
45834
45835 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45836 // type, widen both sides to avoid a trip through memory.
45837 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45838 Subtarget.hasAVX512()) {
45839 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45840 N0 = DAG.getBitcast(MVT::v8i1, N0);
45841 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45842 DAG.getVectorIdxConstant(0, dl));
45843 }
45844
45845 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45846 // type, widen both sides to avoid a trip through memory.
45847 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45848 Subtarget.hasAVX512()) {
45849 // Use zeros for the widening if we already have some zeroes. This can
45850 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45851 // stream of this.
45852 // FIXME: It might make sense to detect a concat_vectors with a mix of
45853 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45854 // a separate combine. What we can't do is canonicalize the operands of
45855 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45856 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45857 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45858 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45859 SrcVT = LastOp.getValueType();
45860 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45861 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
45862 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45863 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45864 N0 = DAG.getBitcast(MVT::i8, N0);
45865 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45866 }
45867 }
45868
45869 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45870 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45871 Ops[0] = N0;
45872 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45873 N0 = DAG.getBitcast(MVT::i8, N0);
45874 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45875 }
45876 } else if (DCI.isAfterLegalizeDAG()) {
45877 // If we're bitcasting from iX to vXi1, see if the integer originally
45878 // began as a vXi1 and whether we can remove the bitcast entirely.
45879 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45880 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45881 if (SDValue V =
45882 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45883 return V;
45884 }
45885 }
45886
45887 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45888 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45889 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45890 // we can help with known bits propagation from the vXi1 domain to the
45891 // scalar domain.
45892 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45893 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45894 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45895 isNullConstant(N0.getOperand(1)))
45896 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45897 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45898
45899 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45900 // and the vbroadcast_load are both integer or both fp. In some cases this
45901 // will remove the bitcast entirely.
45902 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45903 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45904 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45905 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45906 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45907 // Don't swap i8/i16 since don't have fp types that size.
45908 if (MemSize >= 32) {
45909 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45910 : MVT::getIntegerVT(MemSize);
45911 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45912 : MVT::getIntegerVT(SrcVTSize);
45913 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45914
45915 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45916 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45917 SDValue ResNode =
45918 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45919 MemVT, BCast->getMemOperand());
45920 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45921 return DAG.getBitcast(VT, ResNode);
45922 }
45923 }
45924
45925 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45926 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45927 SDValue Src = peekThroughTruncates(N0);
45928 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45929 Src.getOperand(0).getValueSizeInBits() == 128 &&
45930 isNullConstant(Src.getOperand(1))) {
45931 SDLoc DL(N);
45932 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45933 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45934 DAG.getVectorIdxConstant(0, DL));
45935 }
45936 }
45937
45938 // Since MMX types are special and don't usually play with other vector types,
45939 // it's better to handle them early to be sure we emit efficient code by
45940 // avoiding store-load conversions.
45941 if (VT == MVT::x86mmx) {
45942 // Detect MMX constant vectors.
45943 APInt UndefElts;
45944 SmallVector<APInt, 1> EltBits;
45945 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45946 /*AllowWholeUndefs*/ true,
45947 /*AllowPartialUndefs*/ true)) {
45948 SDLoc DL(N0);
45949 // Handle zero-extension of i32 with MOVD.
45950 if (EltBits[0].countl_zero() >= 32)
45951 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45952 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45953 // Else, bitcast to a double.
45954 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45955 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45956 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45957 }
45958
45959 // Detect bitcasts to x86mmx low word.
45960 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45961 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45962 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45963 bool LowUndef = true, AllUndefOrZero = true;
45964 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45965 SDValue Op = N0.getOperand(i);
45966 LowUndef &= Op.isUndef() || (i >= e/2);
45967 AllUndefOrZero &= isNullConstantOrUndef(Op);
45968 }
45969 if (AllUndefOrZero) {
45970 SDValue N00 = N0.getOperand(0);
45971 SDLoc dl(N00);
45972 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45973 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45974 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45975 }
45976 }
45977
45978 // Detect bitcasts of 64-bit build vectors and convert to a
45979 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45980 // lowest element.
45981 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45982 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45983 SrcVT == MVT::v8i8))
45984 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45985
45986 // Detect bitcasts between element or subvector extraction to x86mmx.
45987 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45988 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45989 isNullConstant(N0.getOperand(1))) {
45990 SDValue N00 = N0.getOperand(0);
45991 if (N00.getValueType().is128BitVector())
45992 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45993 DAG.getBitcast(MVT::v2i64, N00));
45994 }
45995
45996 // Detect bitcasts from FP_TO_SINT to x86mmx.
45997 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45998 SDLoc DL(N0);
45999 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46000 DAG.getUNDEF(MVT::v2i32));
46001 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46002 DAG.getBitcast(MVT::v2i64, Res));
46003 }
46004 }
46005
46006 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46007 // most of these to scalar anyway.
46008 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46009 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46010 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46011 return combinevXi1ConstantToInteger(N0, DAG);
46012 }
46013
46014 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46015 VT.getVectorElementType() == MVT::i1) {
46016 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46017 if (C->isAllOnes())
46018 return DAG.getConstant(1, SDLoc(N0), VT);
46019 if (C->isZero())
46020 return DAG.getConstant(0, SDLoc(N0), VT);
46021 }
46022 }
46023
46024 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46025 // Turn it into a sign bit compare that produces a k-register. This avoids
46026 // a trip through a GPR.
46027 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46028 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46029 isPowerOf2_32(VT.getVectorNumElements())) {
46030 unsigned NumElts = VT.getVectorNumElements();
46031 SDValue Src = N0;
46032
46033 // Peek through truncate.
46034 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46035 Src = N0.getOperand(0);
46036
46037 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46038 SDValue MovmskIn = Src.getOperand(0);
46039 MVT MovmskVT = MovmskIn.getSimpleValueType();
46040 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46041
46042 // We allow extra bits of the movmsk to be used since they are known zero.
46043 // We can't convert a VPMOVMSKB without avx512bw.
46044 if (MovMskElts <= NumElts &&
46045 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46046 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46047 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46048 SDLoc dl(N);
46049 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46050 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46051 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46052 if (EVT(CmpVT) == VT)
46053 return Cmp;
46054
46055 // Pad with zeroes up to original VT to replace the zeroes that were
46056 // being used from the MOVMSK.
46057 unsigned NumConcats = NumElts / MovMskElts;
46058 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46059 Ops[0] = Cmp;
46060 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46061 }
46062 }
46063 }
46064
46065 // Try to remove bitcasts from input and output of mask arithmetic to
46066 // remove GPR<->K-register crossings.
46067 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46068 return V;
46069
46070 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46071 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46072 SrcVT.getVectorNumElements() == 1)
46073 return N0.getOperand(1);
46074
46075 // Convert a bitcasted integer logic operation that has one bitcasted
46076 // floating-point operand into a floating-point logic operation. This may
46077 // create a load of a constant, but that is cheaper than materializing the
46078 // constant in an integer register and transferring it to an SSE register or
46079 // transferring the SSE operand to integer register and back.
46080 unsigned FPOpcode;
46081 switch (N0.getOpcode()) {
46082 // clang-format off
46083 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46084 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46085 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46086 default: return SDValue();
46087 // clang-format on
46088 }
46089
46090 // Check if we have a bitcast from another integer type as well.
46091 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46092 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46093 (Subtarget.hasFP16() && VT == MVT::f16) ||
46094 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46095 TLI.isTypeLegal(VT))))
46096 return SDValue();
46097
46098 SDValue LogicOp0 = N0.getOperand(0);
46099 SDValue LogicOp1 = N0.getOperand(1);
46100 SDLoc DL0(N0);
46101
46102 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46103 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46104 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46105 LogicOp0.getOperand(0).getValueType() == VT &&
46106 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46107 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46108 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46109 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46110 }
46111 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46112 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46113 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46114 LogicOp1.getOperand(0).getValueType() == VT &&
46115 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46116 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46117 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46118 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46119 }
46120
46121 return SDValue();
46122}
46123
46124// (mul (zext a), (sext b))
46125static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46126 SDValue &Op1) {
46127 Op0 = Mul.getOperand(0);
46128 Op1 = Mul.getOperand(1);
46129
46130 // Operand 1 should be the sign-extended value.
46131 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46132 std::swap(Op0, Op1);
46133
46134 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46135 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46136 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46137 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46138 return true;
46139
46140 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46141 return (BV && BV->isConstant());
46142 };
46143
46144 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46145 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46146 // signed value, so we just check its sign bits.
46147 if ((IsFreeTruncation(Op0) &&
46148 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46149 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46150 return true;
46151
46152 return false;
46153}
46154
46155 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46156 unsigned &LogBias, const SDLoc &DL,
46157 const X86Subtarget &Subtarget) {
46158 // Extend or truncate to MVT::i8 first.
46159 MVT Vi8VT =
46160 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46161 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46162 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46163
46164 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46165 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46166 // The src A, B element type is i8, but the dst C element type is i32.
46167 // When we calculate the reduce stage, we use src vector type vXi8 for it
46168 // so we need logbias 2 to avoid extra 2 stages.
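// Illustrative group of four (values chosen for exposition): A = {1,2,3,4}
// (unsigned) and B = {-1,1,-1,1} (signed) accumulate (-1 + 2 - 3 + 4) = 2
// into the corresponding i32 lane of C.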
46169 LogBias = 2;
46170
46171 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46172 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46173 RegSize = std::max(512u, RegSize);
46174
46175 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46176 // fill in the missing vector elements with 0.
46177 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46178 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46179 Ops[0] = LHS;
46180 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46181 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46182 Ops[0] = RHS;
46183 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46184
46185 // Actually build the DotProduct, split as 256/512 bits for
46186 // AVXVNNI/AVX512VNNI.
46187 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46188 ArrayRef<SDValue> Ops) {
46189 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46190 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46191 };
46192 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46193 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46194
46195 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46196 DpBuilder, false);
46197}
46198
46199// Create a PSADBW given two sources representable as zexts of vXi8.
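// PSADBW sums |a[i] - b[i]| over each group of 8 unsigned bytes into a 64-bit
// lane; e.g. (illustrative) bytes {1,2,3,4,0,0,0,0} vs {4,2,0,4,0,0,0,0}
// give 3 + 0 + 3 + 0 = 6 in that qword.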
46200 static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1,
46201 const SDLoc &DL, const X86Subtarget &Subtarget) {
46202 // Find the appropriate width for the PSADBW.
46203 EVT DstVT = N0.getValueType();
46204 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46205 DstVT.getVectorElementCount());
46206 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46207
46208 // Widen the vXi8 vectors, padding with zero vector elements.
46209 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46210 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46211 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46212 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46213 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46214 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46215 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46216
46217 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46218 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46219 ArrayRef<SDValue> Ops) {
46220 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46221 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46222 };
46223 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46224 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46225 PSADBWBuilder);
46226}
46227
46228// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46229// PHMINPOSUW.
46230 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46231 const X86Subtarget &Subtarget) {
46232 // Bail without SSE41.
46233 if (!Subtarget.hasSSE41())
46234 return SDValue();
46235
46236 EVT ExtractVT = Extract->getValueType(0);
46237 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46238 return SDValue();
46239
46240 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46241 ISD::NodeType BinOp;
46242 SDValue Src = DAG.matchBinOpReduction(
46243 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46244 if (!Src)
46245 return SDValue();
46246
46247 EVT SrcVT = Src.getValueType();
46248 EVT SrcSVT = SrcVT.getScalarType();
46249 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46250 return SDValue();
46251
46252 SDLoc DL(Extract);
46253 SDValue MinPos = Src;
46254
46255 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46256 while (SrcVT.getSizeInBits() > 128) {
46257 SDValue Lo, Hi;
46258 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46259 SrcVT = Lo.getValueType();
46260 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46261 }
46262 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46263 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46264 "Unexpected value type");
46265
46266 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46267 // to flip the value accordingly.
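// For example (illustrative, per element with SMAX): XOR with 0x7fff maps
// 0x7fff (signed max) to 0x0000 (unsigned min), so PHMINPOSUW selects it,
// and XORing the result again restores the original value.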
46268 SDValue Mask;
46269 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46270 if (BinOp == ISD::SMAX)
46271 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46272 else if (BinOp == ISD::SMIN)
46273 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46274 else if (BinOp == ISD::UMAX)
46275 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46276
46277 if (Mask)
46278 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46279
46280 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46281 // shuffling each upper element down and inserting zeros. This means that the
46282 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46283 // ready for the PHMINPOS.
46284 if (ExtractVT == MVT::i8) {
46285 SDValue Upper = DAG.getVectorShuffle(
46286 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46287 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46288 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46289 }
46290
46291 // Perform the PHMINPOS on a v8i16 vector,
46292 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46293 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46294 MinPos = DAG.getBitcast(SrcVT, MinPos);
46295
46296 if (Mask)
46297 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46298
46299 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46300 DAG.getVectorIdxConstant(0, DL));
46301}
46302
46303// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
46304 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46305 const X86Subtarget &Subtarget) {
46306 // Bail without SSE2.
46307 if (!Subtarget.hasSSE2())
46308 return SDValue();
46309
46310 EVT ExtractVT = Extract->getValueType(0);
46311 unsigned BitWidth = ExtractVT.getSizeInBits();
46312 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46313 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46314 return SDValue();
46315
46316 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46317 ISD::NodeType BinOp;
46318 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46319 if (!Match && ExtractVT == MVT::i1)
46320 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46321 if (!Match)
46322 return SDValue();
46323
46324 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46325 // which we can't support here for now.
46326 if (Match.getScalarValueSizeInBits() != BitWidth)
46327 return SDValue();
46328
46329 SDValue Movmsk;
46330 SDLoc DL(Extract);
46331 EVT MatchVT = Match.getValueType();
46332 unsigned NumElts = MatchVT.getVectorNumElements();
46333 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46334 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46335 LLVMContext &Ctx = *DAG.getContext();
46336
46337 if (ExtractVT == MVT::i1) {
46338 // Special case for (pre-legalization) vXi1 reductions.
46339 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46340 return SDValue();
46341 if (Match.getOpcode() == ISD::SETCC) {
46342 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46343 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46344 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46345 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46346 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46347 X86::CondCode X86CC;
46348 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46349 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46350 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46351 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46352 DAG, X86CC))
46353 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46354 getSETCC(X86CC, V, DL, DAG));
46355 }
46356 }
46357 if (TLI.isTypeLegal(MatchVT)) {
46358 // If this is a legal AVX512 predicate type then we can just bitcast.
46359 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46360 Movmsk = DAG.getBitcast(MovmskVT, Match);
46361 } else {
46362 // Use combineBitcastvxi1 to create the MOVMSK.
46363 while (NumElts > MaxElts) {
46364 SDValue Lo, Hi;
46365 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46366 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46367 NumElts /= 2;
46368 }
46369 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46370 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46371 }
46372 if (!Movmsk)
46373 return SDValue();
46374 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46375 } else {
46376 // FIXME: Better handling of k-registers or 512-bit vectors?
46377 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46378 if (!(MatchSizeInBits == 128 ||
46379 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46380 return SDValue();
46381
46382 // Make sure this isn't a vector of 1 element. The perf win from using
46383 // MOVMSK diminishes with fewer elements in the reduction, but it is
46384 // generally better to get the comparison over to the GPRs as soon as
46385 // possible to reduce the number of vector ops.
46386 if (Match.getValueType().getVectorNumElements() < 2)
46387 return SDValue();
46388
46389 // Check that we are extracting a reduction of all sign bits.
46390 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46391 return SDValue();
46392
46393 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46394 SDValue Lo, Hi;
46395 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46396 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46397 MatchSizeInBits = Match.getValueSizeInBits();
46398 }
46399
46400 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46401 MVT MaskSrcVT;
46402 if (64 == BitWidth || 32 == BitWidth)
46403 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46404 MatchSizeInBits / BitWidth);
46405 else
46406 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46407
46408 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46409 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46410 NumElts = MaskSrcVT.getVectorNumElements();
46411 }
46412 assert((NumElts <= 32 || NumElts == 64) &&
46413 "Not expecting more than 64 elements");
46414
46415 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46416 if (BinOp == ISD::XOR) {
46417 // parity -> (PARITY(MOVMSK X))
46418 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46419 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46420 }
46421
46422 SDValue CmpC;
46423 ISD::CondCode CondCode;
46424 if (BinOp == ISD::OR) {
46425 // any_of -> MOVMSK != 0
46426 CmpC = DAG.getConstant(0, DL, CmpVT);
46427 CondCode = ISD::CondCode::SETNE;
46428 } else {
46429 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46430 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46431 DL, CmpVT);
46432 CondCode = ISD::CondCode::SETEQ;
46433 }
46434
46435 // The setcc produces an i8 of 0/1, so extend that to the result width and
46436 // negate to get the final 0/-1 mask value.
46437 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46438 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46439 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46440 return DAG.getNegative(Zext, DL, ExtractVT);
46441}
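// Rough sketch (names illustrative): for an i8 all_of reduction over a v16i8
// comparison result V (each lane all-ones or zero), combinePredicateReduction
// produces approximately
//   m   = MOVMSK V                 ; 16 sign bits into a GPR
//   c   = setcc m, 0xFFFF, eq      ; all_of: every mask bit set
//   res = neg (zext c to i8)       ; back to the 0 / -1 lane convention
// An any_of (OR) reduction instead tests m != 0, and a parity (XOR) reduction
// of i1 values uses PARITY of the MOVMSK result.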
46442
46443 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46444 const X86Subtarget &Subtarget) {
46445 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46446 return SDValue();
46447
46448 EVT ExtractVT = Extract->getValueType(0);
46449 // Verify the type we're extracting is i32, as the output element type of
46450 // vpdpbusd is i32.
46451 if (ExtractVT != MVT::i32)
46452 return SDValue();
46453
46454 EVT VT = Extract->getOperand(0).getValueType();
46455 if (!isPowerOf2_32(VT.getVectorNumElements()))
46456 return SDValue();
46457
46458 // Match shuffle + add pyramid.
46459 ISD::NodeType BinOp;
46460 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46461
46462 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46463 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46464 // before adding into the accumulator.
46465 // TODO:
46466 // We also need to verify that the multiply has at least 2x the number of bits
46467 // of the input. We shouldn't match
46468 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46469 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46470 // Root = Root.getOperand(0);
46471
46472 // If there was a match, we want Root to be a mul.
46473 if (!Root || Root.getOpcode() != ISD::MUL)
46474 return SDValue();
46475
46476 // Check whether we have an extend and mul pattern
46477 SDValue LHS, RHS;
46478 if (!detectExtMul(DAG, Root, LHS, RHS))
46479 return SDValue();
46480
46481 // Create the dot product instruction.
46482 SDLoc DL(Extract);
46483 unsigned StageBias;
46484 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46485
46486 // If the original vector was wider than 4 elements, sum over the results
46487 // in the DP vector.
46488 unsigned Stages = Log2_32(VT.getVectorNumElements());
46489 EVT DpVT = DP.getValueType();
46490
46491 if (Stages > StageBias) {
46492 unsigned DpElems = DpVT.getVectorNumElements();
46493
46494 for (unsigned i = Stages - StageBias; i > 0; --i) {
46495 SmallVector<int, 16> Mask(DpElems, -1);
46496 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46497 Mask[j] = MaskEnd + j;
46498
46499 SDValue Shuffle =
46500 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46501 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46502 }
46503 }
46504
46505 // Return the lowest ExtractSizeInBits bits.
46506 EVT ResVT =
46507 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46508 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46509 DP = DAG.getBitcast(ResVT, DP);
46510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46511 Extract->getOperand(1));
46512}
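// Rough sketch (names illustrative): an i32 add-reduction of the products of
// two v16i8 inputs (one zero-extended, one sign-extended, as required by
// detectExtMul) is rewritten by combineVPDPBUSDPattern approximately as
//   dp  = VPDPBUSD zero, A, B              ; v4i32 of 4-way byte dot products
//   dp  = add dp, shuffle(dp, <2,3,u,u>)   ; shuffle+add pyramid over the
//   dp  = add dp, shuffle(dp, <1,u,u,u>)   ;   remaining stages
//   res = extract_elt dp, 0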
46513
46514 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46515 const X86Subtarget &Subtarget) {
46516 using namespace SDPatternMatch;
46517
46518 // PSADBW is only supported on SSE2 and up.
46519 if (!Subtarget.hasSSE2())
46520 return SDValue();
46521
46522 EVT ExtractVT = Extract->getValueType(0);
46523 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46524 ExtractVT != MVT::i64)
46525 return SDValue();
46526
46527 EVT VT = Extract->getOperand(0).getValueType();
46528 if (!isPowerOf2_32(VT.getVectorNumElements()))
46529 return SDValue();
46530
46531 // Match shuffle + add pyramid.
46532 ISD::NodeType BinOp;
46533 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46534 if (!Root)
46535 return SDValue();
46536
46537 // The operand is expected to be zero extended from i8.
46538 // In order to convert to i64 and above, additional any/zero/sign
46539 // extend is expected.
46540 // The zero extend from 32 bit has no mathematical effect on the result.
46541 // Also, the sign extend is effectively a zero extend
46542 // (it extends the sign bit, which is zero).
46543 // So it is correct to skip the sign/zero extend instruction.
46544 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46545 Root.getOpcode() == ISD::ZERO_EXTEND ||
46546 Root.getOpcode() == ISD::ANY_EXTEND)
46547 Root = Root.getOperand(0);
46548
46549 // Check whether we have a vXi8 abdu pattern.
46550 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46551 SDValue Src0, Src1;
46552 if (!sd_match(
46553 Root,
46554 m_AnyOf(
46555 m_SpecificVectorElementVT(
46556 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46557 m_SpecificVectorElementVT(
46558 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46559 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46560 m_Abs(
46561 m_Sub(m_AllOf(m_Value(Src0),
46562 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46563 m_AllOf(m_Value(Src1),
46564 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46565 return SDValue();
46566
46567 // Create the SAD instruction.
46568 SDLoc DL(Extract);
46569 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46570
46571 // If the original vector was wider than 8 elements, sum over the results
46572 // in the SAD vector.
46573 unsigned Stages = Log2_32(VT.getVectorNumElements());
46574 EVT SadVT = SAD.getValueType();
46575 if (Stages > 3) {
46576 unsigned SadElems = SadVT.getVectorNumElements();
46577
46578 for (unsigned i = Stages - 3; i > 0; --i) {
46579 SmallVector<int, 16> Mask(SadElems, -1);
46580 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46581 Mask[j] = MaskEnd + j;
46582
46583 SDValue Shuffle =
46584 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46585 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46586 }
46587 }
46588
46589 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46590 // Return the lowest ExtractSizeInBits bits.
46591 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46592 SadVT.getSizeInBits() / ExtractSizeInBits);
46593 SAD = DAG.getBitcast(ResVT, SAD);
46594 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46595 Extract->getOperand(1));
46596}
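// Rough sketch (names illustrative): an i32 add-reduction of
// zext(abdu(v16i8 A, v16i8 B)) is rewritten by combineBasicSADPattern as
//   sad = PSADBW A, B                    ; v2i64 of two partial sums
//   sad = add sad, shuffle(sad, <1,u>)   ; fold the high partial sum into lane 0
//   res = extract_elt (bitcast sad), 0   ; low ExtractSizeInBits bits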
46597
46598// If this extract is from a loaded vector value and will be used as an
46599// integer, that requires a potentially expensive XMM -> GPR transfer.
46600// Additionally, if we can convert to a scalar integer load, that will likely
46601// be folded into a subsequent integer op.
46602// Note: SrcVec might not have a VecVT type, but it must be the same size.
46603// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46604// to a single-use of the loaded vector. For the reasons above, we
46605// expect this to be profitable even if it creates an extra load.
46606static SDValue
46607 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46608 const SDLoc &dl, SelectionDAG &DAG,
46609 TargetLowering::DAGCombinerInfo &DCI) {
46610 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46611 "Only EXTRACT_VECTOR_ELT supported so far");
46612
46613 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46614 EVT VT = N->getValueType(0);
46615
46616 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46617 return Use->getOpcode() == ISD::STORE ||
46618 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46619 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46620 });
46621
46622 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46623 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46624 VecVT.getVectorElementType() == VT &&
46625 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46626 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46627 SDValue NewPtr = TLI.getVectorElementPointer(
46628 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46629 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46630 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46631 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46632 SDValue Load =
46633 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46634 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46635 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46636 return Load;
46637 }
46638
46639 return SDValue();
46640}
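// Rough sketch: given a simple load L = (load <4 x i32>, p) and a
// post-legalization extract of element 2 that is only used as an integer, the
// fold above replaces the XMM->GPR transfer with a narrow scalar load:
//   res = load i32, p+8   ; pointer info, alignment and chain derived from L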
46641
46642// Attempt to peek through a target shuffle and extract the scalar from the
46643// source.
46644 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46645 TargetLowering::DAGCombinerInfo &DCI,
46646 const X86Subtarget &Subtarget) {
46647 if (DCI.isBeforeLegalizeOps())
46648 return SDValue();
46649
46650 SDLoc dl(N);
46651 SDValue Src = N->getOperand(0);
46652 SDValue Idx = N->getOperand(1);
46653
46654 EVT VT = N->getValueType(0);
46655 EVT SrcVT = Src.getValueType();
46656 EVT SrcSVT = SrcVT.getVectorElementType();
46657 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46658 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46659
46660 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46661 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46662 return SDValue();
46663
46664 const APInt &IdxC = N->getConstantOperandAPInt(1);
46665 if (IdxC.uge(NumSrcElts))
46666 return SDValue();
46667
46668 SDValue SrcBC = peekThroughBitcasts(Src);
46669
46670 // Handle extract(bitcast(broadcast(scalar_value))).
46671 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46672 SDValue SrcOp = SrcBC.getOperand(0);
46673 EVT SrcOpVT = SrcOp.getValueType();
46674 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46675 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46676 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46677 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46678 // TODO support non-zero offsets.
46679 if (Offset == 0) {
46680 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46681 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46682 return SrcOp;
46683 }
46684 }
46685 }
46686
46687 // If we're extracting a single element from a broadcast load and there are
46688 // no other users, just create a single load.
46689 if (X86ISD::VBROADCAST_LOAD == SrcBC.getOpcode() &&
46690 SrcBC.hasOneUse()) {
46691 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46692 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46693 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46694 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46695 SDValue Load =
46696 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46697 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46698 MemIntr->getMemOperand()->getFlags());
46699 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46700 return Load;
46701 }
46702 }
46703
46704 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46705 // TODO: Move to DAGCombine?
46706 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46707 SrcBC.getValueType().isInteger() &&
46708 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46709 SrcBC.getScalarValueSizeInBits() ==
46710 SrcBC.getOperand(0).getValueSizeInBits()) {
46711 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46712 if (IdxC.ult(Scale)) {
46713 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46714 SDValue Scl = SrcBC.getOperand(0);
46715 EVT SclVT = Scl.getValueType();
46716 if (Offset) {
46717 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46718 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46719 }
46720 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46721 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46722 return Scl;
46723 }
46724 }
46725
46726 // Handle extract(truncate(x)) for 0'th index.
46727 // TODO: Treat this as a faux shuffle?
46728 // TODO: When can we use this for general indices?
46729 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46730 (SrcVT.getSizeInBits() % 128) == 0) {
46731 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46732 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46733 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46734 Idx);
46735 }
46736
46737 // We can only legally extract other elements from 128-bit vectors and in
46738 // certain circumstances, depending on SSE-level.
46739 // TODO: Investigate float/double extraction if it will be just stored.
46740 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46741 unsigned Idx) {
46742 EVT VecSVT = VecVT.getScalarType();
46743 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46744 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46745 VecSVT == MVT::i64)) {
46746 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46747 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46748 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46749 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46750 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46751 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46752 Idx &= (NumEltsPerLane - 1);
46753 }
46754 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46755 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46756 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46757 DAG.getBitcast(VecVT, Vec),
46758 DAG.getVectorIdxConstant(Idx, dl));
46759 }
46760 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46761 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46762 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46763 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46764 DAG.getTargetConstant(Idx, dl, MVT::i8));
46765 }
46766 return SDValue();
46767 };
46768
46769 // Resolve the target shuffle inputs and mask.
46770 SmallVector<int, 16> Mask;
46771 SmallVector<SDValue, 2> Ops;
46772 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46773 return SDValue();
46774
46775 // Shuffle inputs must be the same size as the result.
46776 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46777 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46778 }))
46779 return SDValue();
46780
46781 // Attempt to narrow/widen the shuffle mask to the correct size.
46782 if (Mask.size() != NumSrcElts) {
46783 if ((NumSrcElts % Mask.size()) == 0) {
46784 SmallVector<int, 16> ScaledMask;
46785 int Scale = NumSrcElts / Mask.size();
46786 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46787 Mask = std::move(ScaledMask);
46788 } else if ((Mask.size() % NumSrcElts) == 0) {
46789 // Simplify Mask based on demanded element.
46790 int ExtractIdx = (int)IdxC.getZExtValue();
46791 int Scale = Mask.size() / NumSrcElts;
46792 int Lo = Scale * ExtractIdx;
46793 int Hi = Scale * (ExtractIdx + 1);
46794 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46795 if (i < Lo || Hi <= i)
46796 Mask[i] = SM_SentinelUndef;
46797
46798 SmallVector<int, 16> WidenedMask;
46799 while (Mask.size() > NumSrcElts &&
46800 canWidenShuffleElements(Mask, WidenedMask))
46801 Mask = std::move(WidenedMask);
46802 }
46803 }
46804
46805 // If narrowing/widening failed, see if we can extract+zero-extend.
46806 int ExtractIdx;
46807 EVT ExtractVT;
46808 if (Mask.size() == NumSrcElts) {
46809 ExtractIdx = Mask[IdxC.getZExtValue()];
46810 ExtractVT = SrcVT;
46811 } else {
46812 unsigned Scale = Mask.size() / NumSrcElts;
46813 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46814 return SDValue();
46815 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46816 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46817 return SDValue();
46818 ExtractIdx = Mask[ScaledIdx];
46819 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46820 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46821 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46822 "Failed to widen vector type");
46823 }
46824
46825 // If the shuffle source element is undef/zero then we can just accept it.
46826 if (ExtractIdx == SM_SentinelUndef)
46827 return DAG.getUNDEF(VT);
46828
46829 if (ExtractIdx == SM_SentinelZero)
46830 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46831 : DAG.getConstant(0, dl, VT);
46832
46833 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46834 ExtractIdx = ExtractIdx % Mask.size();
46835 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46836 return DAG.getZExtOrTrunc(V, dl, VT);
46837
46838 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46839 if (SDValue V = combineExtractFromVectorLoad(
46840 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46841 return V;
46842
46843 return SDValue();
46844}
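// Rough sketch (names illustrative): extracting element 0 from
// X86ISD::UNPCKL(v4i32 A, v4i32 B) resolves through the shuffle mask
// <0,4,1,5> back to element 0 of A, so combineExtractWithShuffle emits
//   extract_elt (unpckl A, B), 0  -->  extract_elt A, 0
// using one of the legal extraction forms from GetLegalExtract above.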
46845
46846/// Extracting a scalar FP value from vector element 0 is free, so extract each
46847/// operand first, then perform the math as a scalar op.
46848 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46849 const X86Subtarget &Subtarget,
46850 TargetLowering::DAGCombinerInfo &DCI) {
46851 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46852 SDValue Vec = ExtElt->getOperand(0);
46853 SDValue Index = ExtElt->getOperand(1);
46854 EVT VT = ExtElt->getValueType(0);
46855 EVT VecVT = Vec.getValueType();
46856
46857 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46858 // non-zero element because the shuffle+scalar op will be cheaper?
46859 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46860 return SDValue();
46861
46862 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46863 // extract, the condition code), so deal with those as a special-case.
46864 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46865 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46866 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46867 return SDValue();
46868
46869 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46870 SDLoc DL(ExtElt);
46871 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46872 Vec.getOperand(0), Index);
46873 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46874 Vec.getOperand(1), Index);
46875 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46876 }
46877
46878 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46879 VT != MVT::f64)
46880 return SDValue();
46881
46882 // Vector FP selects don't fit the pattern of FP math ops (because the
46883 // condition has a different type and we have to change the opcode), so deal
46884 // with those here.
46885 // FIXME: This is restricted to pre type legalization. If we loosen this we
46886 // need to convert vector bool to a scalar bool.
46887 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46888 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46889 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46890 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46891 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46892 SDLoc DL(ExtElt);
46893 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46894 Vec.getOperand(0).getValueType().getScalarType(),
46895 Vec.getOperand(0), Index);
46896 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46897 Vec.getOperand(1), Index);
46898 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46899 Vec.getOperand(2), Index);
46900 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46901 }
46902
46903 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46904 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46905 // missed load folding and fma+fneg combining.
46906 switch (Vec.getOpcode()) {
46907 case ISD::FMA: // Begin 3 operands
46908 case ISD::FMAD:
46909 case ISD::FADD: // Begin 2 operands
46910 case ISD::FSUB:
46911 case ISD::FMUL:
46912 case ISD::FDIV:
46913 case ISD::FREM:
46914 case ISD::FCOPYSIGN:
46915 case ISD::FMINNUM:
46916 case ISD::FMAXNUM:
46917 case ISD::FMINNUM_IEEE:
46918 case ISD::FMAXNUM_IEEE:
46919 case ISD::FMAXIMUM:
46920 case ISD::FMINIMUM:
46921 case ISD::FMAXIMUMNUM:
46922 case ISD::FMINIMUMNUM:
46923 case X86ISD::FMAX:
46924 case X86ISD::FMIN:
46925 case ISD::FABS: // Begin 1 operand
46926 case ISD::FSQRT:
46927 case ISD::FRINT:
46928 case ISD::FCEIL:
46929 case ISD::FTRUNC:
46930 case ISD::FNEARBYINT:
46931 case ISD::FROUNDEVEN:
46932 case ISD::FROUND:
46933 case ISD::FFLOOR:
46934 case X86ISD::FRCP:
46935 case X86ISD::FRSQRT: {
46936 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46937 SDLoc DL(ExtElt);
46938 SmallVector<SDValue, 4> ExtOps;
46939 for (SDValue Op : Vec->ops())
46940 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46941 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46942 }
46943 default:
46944 return SDValue();
46945 }
46946 llvm_unreachable("All opcodes should return within switch");
46947}
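// Rough sketch: with X, Y of type v4f32,
//   extract_elt (fadd X, Y), 0
//     --> fadd (extract_elt X, 0), (extract_elt Y, 0)
// since moving element 0 of a vector FP value into a scalar register is free.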
46948
46949/// Try to convert a vector reduction sequence composed of binops and shuffles
46950/// into horizontal ops.
46951 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46952 const X86Subtarget &Subtarget) {
46953 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46954
46955 // We need at least SSE2 to do anything here.
46956 if (!Subtarget.hasSSE2())
46957 return SDValue();
46958
46958
46959 ISD::NodeType Opc;
46960 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46961 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46962 if (!Rdx)
46963 return SDValue();
46964
46965 SDValue Index = ExtElt->getOperand(1);
46966 assert(isNullConstant(Index) &&
46967 "Reduction doesn't end in an extract from index 0");
46968
46969 EVT VT = ExtElt->getValueType(0);
46970 EVT VecVT = Rdx.getValueType();
46971 if (VecVT.getScalarType() != VT)
46972 return SDValue();
46973
46974 SDLoc DL(ExtElt);
46975 unsigned NumElts = VecVT.getVectorNumElements();
46976 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46977
46978 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46979 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46980 if (V.getValueType() == MVT::v4i8) {
46981 if (ZeroExtend && Subtarget.hasSSE41()) {
46982 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46983 DAG.getConstant(0, DL, MVT::v4i32),
46984 DAG.getBitcast(MVT::i32, V),
46985 DAG.getVectorIdxConstant(0, DL));
46986 return DAG.getBitcast(MVT::v16i8, V);
46987 }
46988 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46989 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46990 : DAG.getUNDEF(MVT::v4i8));
46991 }
46992 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46993 DAG.getUNDEF(MVT::v8i8));
46994 };
46995
46996 // vXi8 mul reduction - promote to vXi16 mul reduction.
46997 if (Opc == ISD::MUL) {
46998 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46999 return SDValue();
47000 if (VecVT.getSizeInBits() >= 128) {
47001 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47002 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47003 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47004 Lo = DAG.getBitcast(WideVT, Lo);
47005 Hi = DAG.getBitcast(WideVT, Hi);
47006 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47007 while (Rdx.getValueSizeInBits() > 128) {
47008 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47009 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47010 }
47011 } else {
47012 Rdx = WidenToV16I8(Rdx, false);
47013 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47014 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47015 }
47016 if (NumElts >= 8)
47017 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47018 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47019 {4, 5, 6, 7, -1, -1, -1, -1}));
47020 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47021 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47022 {2, 3, -1, -1, -1, -1, -1, -1}));
47023 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47024 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47025 {1, -1, -1, -1, -1, -1, -1, -1}));
47026 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47027 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47028 }
47029
47030 // vXi8 add reduction - sub 128-bit vector.
47031 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47032 Rdx = WidenToV16I8(Rdx, true);
47033 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47034 DAG.getConstant(0, DL, MVT::v16i8));
47035 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47036 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47037 }
47038
47039 // Must be a >=128-bit vector with pow2 elements.
47040 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47041 return SDValue();
47042
47043 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47044 if (VT == MVT::i8) {
47045 while (Rdx.getValueSizeInBits() > 128) {
47046 SDValue Lo, Hi;
47047 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47048 VecVT = Lo.getValueType();
47049 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47050 }
47051 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47052
47053 SDValue Hi = DAG.getVectorShuffle(
47054 MVT::v16i8, DL, Rdx, Rdx,
47055 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47056 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47057 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47058 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47059 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47060 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47061 }
47062
47063 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47064 // If the source vector values are 0-255, then we can use PSADBW to
47065 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47066 // TODO: See if its worth avoiding vXi16/i32 truncations?
47067 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47068 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47069 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47070 Subtarget.hasAVX512())) {
47071 if (Rdx.getValueType() == MVT::v8i16) {
47072 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47073 DAG.getUNDEF(MVT::v8i16));
47074 } else {
47075 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47076 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47077 if (ByteVT.getSizeInBits() < 128)
47078 Rdx = WidenToV16I8(Rdx, true);
47079 }
47080
47081 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47082 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47083 ArrayRef<SDValue> Ops) {
47084 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47085 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47086 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47087 };
47088 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47089 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47090
47091 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47092 while (Rdx.getValueSizeInBits() > 128) {
47093 SDValue Lo, Hi;
47094 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47095 VecVT = Lo.getValueType();
47096 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47097 }
47098 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47099
47100 if (NumElts > 8) {
47101 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47102 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47103 }
47104
47105 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47106 Rdx = DAG.getBitcast(VecVT, Rdx);
47107 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47108 }
47109
47110 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
47111 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47112 return SDValue();
47113
47114 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47115
47116 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47117 // across the whole vector, so we need an extract + hop preliminary stage.
47118 // This is the only step where the operands of the hop are not the same value.
47119 // TODO: We could extend this to handle 512-bit or even longer vectors.
47120 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47121 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47122 unsigned NumElts = VecVT.getVectorNumElements();
47123 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47124 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47125 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47126 VecVT = Rdx.getValueType();
47127 }
47128 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47129 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47130 return SDValue();
47131
47132 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47133 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47134 for (unsigned i = 0; i != ReductionSteps; ++i)
47135 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47136
47137 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47138}
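// Rough sketch (names illustrative): when horizontal ops are considered
// profitable (see shouldUseHorizontalOp), a v4i32 add-reduction
//   extract_elt (add (add X, shuf X) ...), 0
// collapses into Log2(4) = 2 horizontal adds:
//   t = PHADDD X, X
//   t = PHADDD t, t
//   res = extract_elt t, 0
// Small vXi8 add-reductions are instead widened to v16i8 and summed with a
// single PSADBW against zero, as handled earlier in combineArithReduction.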
47139
47140/// Detect vector gather/scatter index generation and convert it from being a
47141/// bunch of shuffles and extracts into a somewhat faster sequence.
47142/// For i686, the best sequence is apparently storing the value and loading
47143/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47144 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47145 TargetLowering::DAGCombinerInfo &DCI,
47146 const X86Subtarget &Subtarget) {
47147 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47148 return NewOp;
47149
47150 SDValue InputVector = N->getOperand(0);
47151 SDValue EltIdx = N->getOperand(1);
47152 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47153
47154 EVT SrcVT = InputVector.getValueType();
47155 EVT VT = N->getValueType(0);
47156 SDLoc dl(InputVector);
47157 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47158 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47159 unsigned NumEltBits = VT.getScalarSizeInBits();
47160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47161
47162 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47163 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47164
47165 // Integer Constant Folding.
47166 if (CIdx && VT.isInteger()) {
47167 APInt UndefVecElts;
47168 SmallVector<APInt, 16> EltBits;
47169 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47170 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47171 EltBits, /*AllowWholeUndefs*/ true,
47172 /*AllowPartialUndefs*/ false)) {
47173 uint64_t Idx = CIdx->getZExtValue();
47174 if (UndefVecElts[Idx])
47175 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47176 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47177 }
47178
47179 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47180 // Improves lowering of bool masks on Rust, which splits them into a byte array.
47181 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47182 SDValue Src = peekThroughBitcasts(InputVector);
47183 if (Src.getValueType().getScalarType() == MVT::i1 &&
47184 TLI.isTypeLegal(Src.getValueType())) {
47185 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47186 SDValue Sub = DAG.getNode(
47187 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47188 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47189 return DAG.getBitcast(VT, Sub);
47190 }
47191 }
47192 }
47193
47194 if (IsPextr) {
47195 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47196 DCI))
47197 return SDValue(N, 0);
47198
47199 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47200 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47201 InputVector.getOpcode() == X86ISD::PINSRW) &&
47202 InputVector.getOperand(2) == EltIdx) {
47203 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47204 "Vector type mismatch");
47205 SDValue Scl = InputVector.getOperand(1);
47206 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47207 return DAG.getZExtOrTrunc(Scl, dl, VT);
47208 }
47209
47210 // TODO - Remove this once we can handle the implicit zero-extension of
47211 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47212 // combineBasicSADPattern.
47213 return SDValue();
47214 }
47215
47216 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47217 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47218 InputVector.getOpcode() == ISD::BITCAST &&
47219 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47220 isNullConstant(EltIdx) && InputVector.hasOneUse())
47221 return DAG.getBitcast(VT, InputVector);
47222
47223 // Detect mmx to i32 conversion through a v2i32 elt extract.
47224 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47225 InputVector.getOpcode() == ISD::BITCAST &&
47226 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47227 isNullConstant(EltIdx) && InputVector.hasOneUse())
47228 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47229 InputVector.getOperand(0));
47230
47231 // Check whether this extract is the root of a sum of absolute differences
47232 // pattern. This has to be done here because we really want it to happen
47233 // pre-legalization.
47234 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47235 return SAD;
47236
47237 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47238 return VPDPBUSD;
47239
47240 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47241 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47242 return Cmp;
47243
47244 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47245 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47246 return MinMax;
47247
47248 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47249 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47250 return V;
47251
47252 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47253 return V;
47254
47255 if (CIdx)
47256 if (SDValue V = combineExtractFromVectorLoad(
47257 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47258 dl, DAG, DCI))
47259 return V;
47260
47261 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47262 // and then testing the relevant element.
47263 //
47264 // Note that we only combine extracts on the *same* result number, i.e.
47265 // t0 = merge_values a0, a1, a2, a3
47266 // i1 = extract_vector_elt t0, Constant:i64<2>
47267 // i1 = extract_vector_elt t0, Constant:i64<3>
47268 // but not
47269 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47270 // since the latter would need its own MOVMSK.
47271 if (SrcVT.getScalarType() == MVT::i1) {
47272 bool IsVar = !CIdx;
47273 SmallVector<SDNode *, 16> BoolExtracts;
47274 unsigned ResNo = InputVector.getResNo();
47275 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47276 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47277 Use->getOperand(0).getResNo() == ResNo &&
47278 Use->getValueType(0) == MVT::i1) {
47279 BoolExtracts.push_back(Use);
47280 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47281 return true;
47282 }
47283 return false;
47284 };
47285 // TODO: Can we drop the oneuse check for constant extracts?
47286 if (all_of(InputVector->users(), IsBoolExtract) &&
47287 (IsVar || BoolExtracts.size() > 1)) {
47288 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47289 if (SDValue BC =
47290 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47291 for (SDNode *Use : BoolExtracts) {
47292 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47293 // Mask = 1 << MaskIdx
47294 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47295 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47296 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47297 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47298 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47299 DCI.CombineTo(Use, Res);
47300 }
47301 return SDValue(N, 0);
47302 }
47303 }
47304 }
47305
47306 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47307 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47308 SDValue TruncSrc = InputVector.getOperand(0);
47309 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47310 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47311 SDValue NewExt =
47312 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47313 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47314 }
47315 }
47316
47317 return SDValue();
47318}
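// Rough sketch (names illustrative): if a v16i1 SETCC result is only consumed
// by i1 element extracts, the block above converts it through
// combineBitcastvxi1 into a single MOVMSK plus per-element bit tests, e.g.
//   m    = MOVMSK (pcmpeqb A, B)
//   bit2 = setcc (and m, 1 << 2), 1 << 2, eq
//   bit5 = setcc (and m, 1 << 5), 1 << 5, eq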
47319
47320// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47321// This is more or less the reverse of combineBitcastvxi1.
47322 static SDValue combineToExtendBoolVectorInReg(
47323 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47324 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47325 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47326 Opcode != ISD::ANY_EXTEND)
47327 return SDValue();
47328 if (!DCI.isBeforeLegalizeOps())
47329 return SDValue();
47330 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47331 return SDValue();
47332
47333 EVT SVT = VT.getScalarType();
47334 EVT InSVT = N0.getValueType().getScalarType();
47335 unsigned EltSizeInBits = SVT.getSizeInBits();
47336
47337 // Input type must be extending a bool vector (bit-casted from a scalar
47338 // integer) to legal integer types.
47339 if (!VT.isVector())
47340 return SDValue();
47341 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47342 return SDValue();
47343 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47344 return SDValue();
47345
47346 SDValue N00 = N0.getOperand(0);
47347 EVT SclVT = N00.getValueType();
47348 if (!SclVT.isScalarInteger())
47349 return SDValue();
47350
47351 SDValue Vec;
47352 SmallVector<int> ShuffleMask;
47353 unsigned NumElts = VT.getVectorNumElements();
47354 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47355
47356 // Broadcast the scalar integer to the vector elements.
47357 if (NumElts > EltSizeInBits) {
47358 // If the scalar integer is greater than the vector element size, then we
47359 // must split it down into sub-sections for broadcasting. For example:
47360 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47361 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47362 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47363 unsigned Scale = NumElts / EltSizeInBits;
47364 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47365 bool UseBroadcast = Subtarget.hasInt256() &&
47366 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47367 Vec = UseBroadcast
47368 ? DAG.getSplat(BroadcastVT, DL, N00)
47369 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47370 Vec = DAG.getBitcast(VT, Vec);
47371
47372 for (unsigned i = 0; i != Scale; ++i) {
47373 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47374 ShuffleMask.append(EltSizeInBits, i + Offset);
47375 }
47376 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47377 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47378 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47379 // If we have register broadcast instructions, use the scalar size as the
47380 // element type for the shuffle. Then cast to the wider element type. The
47381 // widened bits won't be used, and this might allow the use of a broadcast
47382 // load.
47383 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47384 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47385 (NumElts * EltSizeInBits) / NumElts);
47386 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47387 } else {
47388 // For smaller scalar integers, we can simply any-extend it to the vector
47389 // element size (we don't care about the upper bits) and broadcast it to all
47390 // elements.
47391 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47392 }
47393
47394 // Now, mask the relevant bit in each element.
47395 SmallVector<SDValue, 32> Bits;
47396 for (unsigned i = 0; i != NumElts; ++i) {
47397 int BitIdx = (i % EltSizeInBits);
47398 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47399 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47400 }
47401 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47402 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47403
47404 // Compare against the bitmask and extend the result.
47405 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47406 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47407 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47408
47409 // For SEXT, this is now done, otherwise shift the result down for
47410 // zero-extension.
47411 if (Opcode == ISD::SIGN_EXTEND)
47412 return Vec;
47413 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47414 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47415}
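// Rough sketch: sign_extend(v16i8 bitcast(i16 X)) without AVX-512 becomes,
// approximately,
//   v   = each byte of X replicated across 8 adjacent lanes
//   v   = and v, <1,2,4,...,128, 1,2,...,128>   ; isolate bit i%8 in lane i
//   res = sext (setcc v, bitmask, eq)           ; all-ones where the bit was set
// and ZERO_EXTEND/ANY_EXTEND additionally shift right by 7 to leave 0/1 lanes.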
47416
47417/// If both arms of a vector select are concatenated vectors, split the select,
47418/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47419/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47420/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47421 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47422 const X86Subtarget &Subtarget) {
47423 unsigned Opcode = N->getOpcode();
47424 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47425 return SDValue();
47426
47427 // TODO: Split 512-bit vectors too?
47428 EVT VT = N->getValueType(0);
47429 if (!VT.is256BitVector())
47430 return SDValue();
47431
47432 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47433 SDValue Cond = N->getOperand(0);
47434 SDValue TVal = N->getOperand(1);
47435 SDValue FVal = N->getOperand(2);
47436 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47437 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47438 return SDValue();
47439
47440 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47441 ArrayRef<SDValue> Ops) {
47442 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47443 };
47444 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47445 /*CheckBWI*/ false);
47446}
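// Rough sketch: with T0, T1, F0, F1 of type v4i32,
//   vselect Cond, (concat T0, T1), (concat F0, F1)
// is split by SplitOpsAndApply into two 128-bit selects whose results are
// concatenated back, avoiding a 256-bit blend:
//   concat (vselect Cond.lo, T0, F0), (vselect Cond.hi, T1, F1)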
47447
47448 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47449 const SDLoc &DL) {
47450 SDValue Cond = N->getOperand(0);
47451 SDValue LHS = N->getOperand(1);
47452 SDValue RHS = N->getOperand(2);
47453
47454 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47455 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47456 if (!TrueC || !FalseC)
47457 return SDValue();
47458
47459 // Don't do this for crazy integer types.
47460 EVT VT = N->getValueType(0);
47461 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47462 return SDValue();
47463
47464 // We're going to use the condition bit in math or logic ops. We could allow
47465 // this with a wider condition value (post-legalization it becomes an i8),
47466 // but if nothing is creating selects that late, it doesn't matter.
47467 if (Cond.getValueType() != MVT::i1)
47468 return SDValue();
47469
47470 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47471 // 3, 5, or 9 with i32/i64, so those get transformed too.
47472 // TODO: For constants that overflow or do not differ by power-of-2 or small
47473 // multiplier, convert to 'and' + 'add'.
47474 const APInt &TrueVal = TrueC->getAPIntValue();
47475 const APInt &FalseVal = FalseC->getAPIntValue();
47476
47477 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47478 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47479 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47480 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47481 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47482 return SDValue();
47483 }
47484
47485 bool OV;
47486 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47487 if (OV)
47488 return SDValue();
47489
47490 APInt AbsDiff = Diff.abs();
47491 if (AbsDiff.isPowerOf2() ||
47492 ((VT == MVT::i32 || VT == MVT::i64) &&
47493 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47494
47495 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47496 // of the condition can usually be folded into a compare predicate, but even
47497 // without that, the sequence should be cheaper than a CMOV alternative.
47498 if (TrueVal.slt(FalseVal)) {
47499 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47500 std::swap(TrueC, FalseC);
47501 }
47502
47503 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47504 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47505
47506 // Multiply condition by the difference if non-one.
47507 if (!AbsDiff.isOne())
47508 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47509
47510 // Add the base if non-zero.
47511 if (!FalseC->isZero())
47512 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47513
47514 return R;
47515 }
47516
47517 return SDValue();
47518}
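// Worked example: for (select i1 C, i32 7, i32 3) the constants differ by 4,
// a power of two, so combineSelectOfTwoConstants emits
//   (zext C) * 4 + 3
// i.e. a shift plus an add instead of a CMOV. If the true constant were the
// smaller one, C would first be inverted so the multiplier stays positive.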
47519
47520/// If this is a *dynamic* select (non-constant condition) and we can match
47521/// this node with one of the variable blend instructions, restructure the
47522/// condition so that blends can use the high (sign) bit of each element.
47523/// This function will also call SimplifyDemandedBits on already created
47524/// BLENDV to perform additional simplifications.
47525 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47526 const SDLoc &DL,
47527 TargetLowering::DAGCombinerInfo &DCI,
47528 const X86Subtarget &Subtarget) {
47529 SDValue Cond = N->getOperand(0);
47530 if ((N->getOpcode() != ISD::VSELECT &&
47531 N->getOpcode() != X86ISD::BLENDV) ||
47532 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47533 return SDValue();
47534
47535 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47536 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47537 EVT VT = N->getValueType(0);
47538
47539 // We can only handle the cases where VSELECT is directly legal on the
47540 // subtarget. We custom lower VSELECT nodes with constant conditions and
47541 // this makes it hard to see whether a dynamic VSELECT will correctly
47542 // lower, so we both check the operation's status and explicitly handle the
47543 // cases where a *dynamic* blend will fail even though a constant-condition
47544 // blend could be custom lowered.
47545 // FIXME: We should find a better way to handle this class of problems.
47546 // Potentially, we should combine constant-condition vselect nodes
47547 // pre-legalization into shuffles and not mark as many types as custom
47548 // lowered.
47549 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47550 return SDValue();
47551 // FIXME: We don't support i16-element blends currently. We could and
47552 // should support them by making *all* the bits in the condition be set
47553 // rather than just the high bit and using an i8-element blend.
47554 if (VT.getVectorElementType() == MVT::i16)
47555 return SDValue();
47556 // Dynamic blending was only available from SSE4.1 onward.
47557 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47558 return SDValue();
47559 // Byte blends are only available in AVX2
47560 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47561 return SDValue();
47562 // There are no 512-bit blend instructions that use sign bits.
47563 if (VT.is512BitVector())
47564 return SDValue();
47565
47566 // Don't optimize before the condition has been transformed to a legal type
47567 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47568 if (BitWidth < 8 || BitWidth > 64)
47569 return SDValue();
47570
47571 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47572 for (SDUse &Use : Cond->uses())
47573 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47574 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47575 Use.getOperandNo() != 0)
47576 return false;
47577
47578 return true;
47579 };
47580
47581 APInt DemandedBits(APInt::getSignMask(BitWidth));
47582
47583 if (OnlyUsedAsSelectCond(Cond)) {
47584 KnownBits Known;
47585 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47586 !DCI.isBeforeLegalizeOps());
47587 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47588 return SDValue();
47589
47590 // If we changed the computation somewhere in the DAG, this change will
47591 // affect all users of Cond. Update all the nodes so that we do not use
47592 // the generic VSELECT anymore. Otherwise, we may perform wrong
47593 // optimizations as we messed with the actual expectation for the vector
47594 // boolean values.
47595 for (SDNode *U : Cond->users()) {
47596 if (U->getOpcode() == X86ISD::BLENDV)
47597 continue;
47598
47599 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47600 Cond, U->getOperand(1), U->getOperand(2));
47601 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47602 DCI.AddToWorklist(U);
47603 }
47604 DCI.CommitTargetLoweringOpt(TLO);
47605 return SDValue(N, 0);
47606 }
47607
47608 // Otherwise we can still at least try to simplify multiple use bits.
47609 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47610 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47611 N->getOperand(1), N->getOperand(2));
47612
47613 return SDValue();
47614}
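// Note (schematic): once a select is lowered as BLENDV only the sign bit of
// each condition lane is demanded, so the SimplifyDemandedBits calls above can
// strip redundant work from the condition; for example a lane computed as
// (sra X, 31) has the same sign bit as X and can be replaced by X.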
47615
47616// Try to match:
47617// (or (and (M, (sub 0, X)), (pandn M, X)))
47618// which is a special case of:
47619// (select M, (sub 0, X), X)
47620// Per:
47621// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47622// We know that, if fNegate is 0 or 1:
47623// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47624//
47625// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47626// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47627// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47628// This lets us transform our vselect to:
47629// (add (xor X, M), (and M, 1))
47630// And further to:
47631// (sub (xor X, M), M)
47632 static SDValue combineLogicBlendIntoConditionalNegate(
47633 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47634 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47635 using namespace SDPatternMatch;
47636 EVT MaskVT = Mask.getValueType();
47637 assert(MaskVT.isInteger() &&
47638 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47639 "Mask must be zero/all-bits");
47640
47641 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47642 !DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
47643 return SDValue();
47644
47645 SDValue V;
47646 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47647 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47648 return SDValue();
47649
47650 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47651 SDValue SubOp2 = Mask;
47652
47653 // If the negate was on the false side of the select, then
47654 // the operands of the SUB need to be swapped. PR 27251.
47655 // This is because the pattern being matched above is
47656 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47657 // but if the pattern matched was
47658 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47659 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47660 // pattern also needs to be a negation of the replacement pattern above.
47661 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47662 // sub accomplishes the negation of the replacement pattern.
47663 if (V == Y)
47664 std::swap(SubOp1, SubOp2);
47665
47666 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47667 return DAG.getBitcast(VT, Res);
47668}
47669
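// Worked example: with M = <-1, 0, -1, 0> (all-ones / zero lanes),
//   vselect M, (sub 0, X), X  -->  sub (xor X, M), M
// lanes where M is all-ones compute (~X) - (-1) = ~X + 1 = -X, and lanes where
// M is zero compute X - 0 = X, matching the select.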
47670 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47671 const X86Subtarget &Subtarget) {
47672 using namespace SDPatternMatch;
47673 if (!Subtarget.hasAVX512())
47674 return SDValue();
47675
47676 ISD::CondCode CC;
47677 SDValue Cond, X, Y, LHS, RHS;
47678 if (!sd_match(N, m_Select(m_AllOf(m_Value(Cond),
47679 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47680 m_CondCode(CC)))),
47681 m_Value(LHS), m_Value(RHS))))
47682 return SDValue();
47683
47684 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47685 !canCombineAsMaskOperation(RHS, Subtarget))
47686 return SDValue();
47687
47688 // Commute LHS and RHS to create opportunity to select mask instruction.
47689 // (vselect M, L, R) -> (vselect ~M, R, L)
47690 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47691 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47692 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47693}
47694
47695/// Do target-specific dag combines on SELECT and VSELECT nodes.
47696 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47697 TargetLowering::DAGCombinerInfo &DCI,
47698 const X86Subtarget &Subtarget) {
47699 SDLoc DL(N);
47700 SDValue Cond = N->getOperand(0);
47701 SDValue LHS = N->getOperand(1);
47702 SDValue RHS = N->getOperand(2);
47703
47704 // Try simplification again because we use this function to optimize
47705 // BLENDV nodes that are not handled by the generic combiner.
47706 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47707 return V;
47708
47709 // When AVX512 is available, the LHS operand of a select instruction can be
47710 // folded with a mask instruction, while the RHS operand can't. Commute the
47711 // LHS and RHS of the select instruction to create the opportunity for
47712 // folding.
47713 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47714 return V;
47715
47716 EVT VT = LHS.getValueType();
47717 EVT CondVT = Cond.getValueType();
47718 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47719 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47720
47721 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47722 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47723 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47724 if (CondVT.isVector() && CondVT.isInteger() &&
47725 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47726 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47729 DL, DAG, Subtarget))
47730 return V;
47731
47732 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47733 SmallVector<int, 64> CondMask;
47734 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47735 N->getOpcode() == X86ISD::BLENDV)) {
47736 // Convert vselects with constant condition into shuffles.
47737 if (DCI.isBeforeLegalizeOps())
47738 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47739
47740 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47741 // by forcing the unselected elements to zero.
47742 // TODO: Can we handle more shuffles with this?
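// This relies on PSHUFB's zeroing behaviour: any index byte with its MSB set
// (0x80) writes zero to that destination byte. Each blend lane keeps exactly
// one side, so the other side's index is forced to 0x80 below and the two
// PSHUFB results can simply be ORed together.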
47743 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47744 SmallVector<SDValue, 1> LHSOps, RHSOps;
47745 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47748 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47749 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47750 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47751 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47752 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47753 assert(ByteMask.size() == LHSMask.size() &&
47754 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47755 for (auto [I, M] : enumerate(ByteMask)) {
47756 // getConstVector sets negative shuffle mask values as undef, so
47757 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47758 if (M < (int)ByteMask.size()) {
47759 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47760 RHSMask[I] = 0x80;
47761 } else {
47762 LHSMask[I] = 0x80;
47763 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47764 }
47765 }
47766 MVT ByteVT = LHSShuf.getSimpleValueType();
47767 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47768 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47769 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47770 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47771 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47772 }
47773 }
47774
47775 // Attempt to combine as shuffle.
47776 SDValue Op(N, 0);
47777 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47778 return Res;
47779 }
47780 }
47781
47782 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47783 // instructions match the semantics of the common C idiom x<y?x:y but not
47784 // x<=y?x:y, because of how they handle negative zero (which can be
47785 // ignored in unsafe-math mode).
47786 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
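// Recall that MINPS/MINSS and friends are not commutative: when the inputs
// are 0.0 and -0.0, or when either input is a NaN, the *second* source
// operand is returned, i.e. minps(x, y) behaves like x < y ? x : y. The case
// analysis below only forms FMIN/FMAX (or swaps the operands first) when
// that asymmetry is provably harmless.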
47787 if ((Cond.getOpcode() == ISD::SETCC ||
47788 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47789 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47790 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47791 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47792 (Subtarget.hasSSE2() ||
47793 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47794 bool IsStrict = Cond->isStrictFPOpcode();
47795 ISD::CondCode CC =
47796 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47797 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47798 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47799
47800 unsigned Opcode = 0;
47801 // Check for x CC y ? x : y.
47802 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47803 switch (CC) {
47804 default: break;
47805 case ISD::SETULT:
47806 // Converting this to a min would handle NaNs incorrectly, and swapping
47807 // the operands would cause it to handle comparisons between positive
47808 // and negative zero incorrectly.
47809 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47811 !(DAG.isKnownNeverZeroFloat(LHS) ||
47813 break;
47814 std::swap(LHS, RHS);
47815 }
47816 Opcode = X86ISD::FMIN;
47817 break;
47818 case ISD::SETOLE:
47819 // Converting this to a min would handle comparisons between positive
47820 // and negative zero incorrectly.
47823 break;
47824 Opcode = X86ISD::FMIN;
47825 break;
47826 case ISD::SETULE:
47827 // Converting this to a min would handle both negative zeros and NaNs
47828 // incorrectly, but we can swap the operands to fix both.
47829 std::swap(LHS, RHS);
47830 [[fallthrough]];
47831 case ISD::SETOLT:
47832 case ISD::SETLT:
47833 case ISD::SETLE:
47834 Opcode = X86ISD::FMIN;
47835 break;
47836
47837 case ISD::SETOGE:
47838 // Converting this to a max would handle comparisons between positive
47839 // and negative zero incorrectly.
47842 break;
47843 Opcode = X86ISD::FMAX;
47844 break;
47845 case ISD::SETUGT:
47846 // Converting this to a max would handle NaNs incorrectly, and swapping
47847 // the operands would cause it to handle comparisons between positive
47848 // and negative zero incorrectly.
47849 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47851 !(DAG.isKnownNeverZeroFloat(LHS) ||
47853 break;
47854 std::swap(LHS, RHS);
47855 }
47856 Opcode = X86ISD::FMAX;
47857 break;
47858 case ISD::SETUGE:
47859 // Converting this to a max would handle both negative zeros and NaNs
47860 // incorrectly, but we can swap the operands to fix both.
47861 std::swap(LHS, RHS);
47862 [[fallthrough]];
47863 case ISD::SETOGT:
47864 case ISD::SETGT:
47865 case ISD::SETGE:
47866 Opcode = X86ISD::FMAX;
47867 break;
47868 }
47869 // Check for x CC y ? y : x -- a min/max with reversed arms.
47870 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47871 switch (CC) {
47872 default: break;
47873 case ISD::SETOGE:
47874 // Converting this to a min would handle comparisons between positive
47875 // and negative zero incorrectly, and swapping the operands would
47876 // cause it to handle NaNs incorrectly.
47878 !(DAG.isKnownNeverZeroFloat(LHS) ||
47879 DAG.isKnownNeverZeroFloat(RHS))) {
47880 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47881 break;
47882 std::swap(LHS, RHS);
47883 }
47884 Opcode = X86ISD::FMIN;
47885 break;
47886 case ISD::SETUGT:
47887 // Converting this to a min would handle NaNs incorrectly.
47888 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47889 break;
47890 Opcode = X86ISD::FMIN;
47891 break;
47892 case ISD::SETUGE:
47893 // Converting this to a min would handle both negative zeros and NaNs
47894 // incorrectly, but we can swap the operands to fix both.
47895 std::swap(LHS, RHS);
47896 [[fallthrough]];
47897 case ISD::SETOGT:
47898 case ISD::SETGT:
47899 case ISD::SETGE:
47900 Opcode = X86ISD::FMIN;
47901 break;
47902
47903 case ISD::SETULT:
47904 // Converting this to a max would handle NaNs incorrectly.
47905 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47906 break;
47907 Opcode = X86ISD::FMAX;
47908 break;
47909 case ISD::SETOLE:
47910 // Converting this to a max would handle comparisons between positive
47911 // and negative zero incorrectly, and swapping the operands would
47912 // cause it to handle NaNs incorrectly.
47914 !DAG.isKnownNeverZeroFloat(LHS) &&
47915 !DAG.isKnownNeverZeroFloat(RHS)) {
47916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47917 break;
47918 std::swap(LHS, RHS);
47919 }
47920 Opcode = X86ISD::FMAX;
47921 break;
47922 case ISD::SETULE:
47923 // Converting this to a max would handle both negative zeros and NaNs
47924 // incorrectly, but we can swap the operands to fix both.
47925 std::swap(LHS, RHS);
47926 [[fallthrough]];
47927 case ISD::SETOLT:
47928 case ISD::SETLT:
47929 case ISD::SETLE:
47930 Opcode = X86ISD::FMAX;
47931 break;
47932 }
47933 }
47934
47935 if (Opcode) {
47936 if (IsStrict) {
47937 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47939 DL, {N->getValueType(0), MVT::Other},
47940 {Cond.getOperand(0), LHS, RHS});
47941 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47942 return Ret;
47943 }
47944 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47945 }
47946 }
47947
47948 // Some mask scalar intrinsics rely on checking if only one bit is set
47949 // and implement it in C code like this:
47950 // A[0] = (U & 1) ? A[0] : W[0];
47951 // This creates some redundant instructions that break pattern matching.
47952 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47953 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47954 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47955 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47956 SDValue AndNode = Cond.getOperand(0);
47957 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47958 isNullConstant(Cond.getOperand(1)) &&
47959 isOneConstant(AndNode.getOperand(1))) {
47960 // LHS and RHS swapped due to
47961 // setcc outputting 1 when AND resulted in 0 and vice versa.
47962 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47963 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47964 }
47965 }
47966
47967 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47968 // lowering on KNL. In this case we convert it to
47969 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47970 // The same situation applies to all vectors of i8 and i16 without BWI.
47971 // Make sure we extend these even before type legalization gets a chance to
47972 // split wide vectors.
47973 // Since SKX these selects have a proper lowering.
47974 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47975 CondVT.getVectorElementType() == MVT::i1 &&
47976 (VT.getVectorElementType() == MVT::i8 ||
47977 VT.getVectorElementType() == MVT::i16)) {
47978 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47979 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47980 }
47981
47982 // AVX512 - Extend select to merge with target shuffle.
47983 // select(mask, extract_subvector(shuffle(x)), y) -->
47984 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47985 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47986 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47987 CondVT.getVectorElementType() == MVT::i1) {
47988 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47989 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47990 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47991 isNullConstant(Op.getOperand(1)) &&
47992 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47993 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47994 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47995 ISD::isBuildVectorAllZeros(Alt.getNode()));
47996 };
47997
47998 bool SelectableLHS = SelectableOp(LHS, RHS);
47999 bool SelectableRHS = SelectableOp(RHS, LHS);
48000 if (SelectableLHS || SelectableRHS) {
48001 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48002 : RHS.getOperand(0).getValueType();
48003 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48004 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48005 VT.getSizeInBits());
48006 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48007 VT.getSizeInBits());
48008 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48009 DAG.getUNDEF(SrcCondVT), Cond,
48010 DAG.getVectorIdxConstant(0, DL));
48011 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48012 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48013 }
48014 }
48015
48016 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48017 return V;
48018
48019 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48020 Cond.hasOneUse()) {
48021 EVT CondVT = Cond.getValueType();
48022 SDValue Cond0 = Cond.getOperand(0);
48023 SDValue Cond1 = Cond.getOperand(1);
48024 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48025
48026 // Canonicalize min/max:
48027 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48028 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48029 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48030 // the need for an extra compare against zero. e.g.
48031 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48032 // subl %esi, %edi
48033 // testl %edi, %edi
48034 // movl $0, %eax
48035 // cmovgl %edi, %eax
48036 // =>
48037 // xorl %eax, %eax
48038 // subl %esi, %edi
48039 // cmovsl %eax, %edi
48040 //
48041 // We can also canonicalize
48042 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48043 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48044 // This allows the use of a test instruction for the compare.
48045 if (LHS == Cond0 && RHS == Cond1) {
48046 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48047 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48049 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48050 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48051 }
48052 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48053 ISD::CondCode NewCC = ISD::SETUGE;
48054 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48055 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48056 }
48057 }
48058
48059 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48060 // fold eq + gt/lt nested selects into ge/le selects
48061 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48062 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48063 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48064 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48065 // .. etc ..
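// In C terms this is the usual three-way-compare collapse, e.g.
//   a == b ? x : (a > b ? x : y)  -->  a >= b ? x : y
// which requires both compares to use the same operands and one of the two
// predicates to be an equality test.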
48066 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48067 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48068 SDValue InnerSetCC = RHS.getOperand(0);
48069 ISD::CondCode InnerCC =
48070 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48071 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48072 Cond0 == InnerSetCC.getOperand(0) &&
48073 Cond1 == InnerSetCC.getOperand(1)) {
48074 ISD::CondCode NewCC;
48075 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48076 // clang-format off
48077 case ISD::SETGT: NewCC = ISD::SETGE; break;
48078 case ISD::SETLT: NewCC = ISD::SETLE; break;
48079 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48080 case ISD::SETULT: NewCC = ISD::SETULE; break;
48081 default: NewCC = ISD::SETCC_INVALID; break;
48082 // clang-format on
48083 }
48084 if (NewCC != ISD::SETCC_INVALID) {
48085 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48086 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48087 }
48088 }
48089 }
48090 }
48091
48092 // Check if the first operand is all zeros and Cond type is vXi1.
48093 // If this an avx512 target we can improve the use of zero masking by
48094 // swapping the operands and inverting the condition.
48095 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48096 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48097 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48098 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48099 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48100 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48101 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48102 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48103 }
48104
48105 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48106 // get split by legalization.
48107 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48108 CondVT.getVectorElementType() == MVT::i1 &&
48109 TLI.isTypeLegal(VT.getScalarType())) {
48110 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48112 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48113 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48114 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48115 }
48116 }
48117
48118 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48119 // with out-of-bounds zeroing.
48120
48121 // Unlike the generic SHL/SRL nodes, AVX2's VSHLV/VSRLV are well defined for
48122 // shift amounts that equal or exceed the element bitwidth: any such
48123 // out-of-range lane is simply written with zero. That matches the zero arm
48124 // of the selects matched below, so the compare-and-select can be folded
48125 // away entirely.
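// Per element this folds, e.g. assuming 32-bit lanes:
//   res[i] = (amt[i] < 32) ? (x[i] << amt[i]) : 0;   // == VPSLLVD x, amt
// because VPSLLVD/VPSRLVD already write zero for counts >= 32.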
48126 if (N->getOpcode() == ISD::VSELECT) {
48127 using namespace llvm::SDPatternMatch;
48128 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48129 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48130 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48131 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48133 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48136 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48137 : X86ISD::VSHLV,
48138 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48139 }
48140 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48141 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48142 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48143 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48145 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48148 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48149 : X86ISD::VSHLV,
48150 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48151 }
48152 }
48153
48154 // Early exit check
48155 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48156 return SDValue();
48157
48158 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48159 return V;
48160
48161 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48162 return V;
48163
48164 // select(~Cond, X, Y) -> select(Cond, Y, X)
48165 if (CondVT.getScalarType() != MVT::i1) {
48166 if (SDValue CondNot = IsNOT(Cond, DAG))
48167 return DAG.getNode(N->getOpcode(), DL, VT,
48168 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48169
48170 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48171 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48172 Cond.getOperand(0).getOpcode() == ISD::AND &&
48173 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48174 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48175 Cond.getScalarValueSizeInBits(),
48176 /*AllowUndefs=*/true) &&
48177 Cond.hasOneUse()) {
48178 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48179 Cond.getOperand(0).getOperand(1));
48180 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48181 }
48182
48183 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48184 // signbit.
48185 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48186 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48187 Cond.hasOneUse()) {
48188 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48189 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48190 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48191 }
48192 }
48193
48194 // Try to optimize vXi1 selects if both operands are either all constants or
48195 // bitcasts from scalar integer type. In that case we can convert the operands
48196 // to integer and use an integer select which will be converted to a CMOV.
48197 // We need to take a little bit of care to avoid creating an i64 type after
48198 // type legalization.
48199 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48200 VT.getVectorElementType() == MVT::i1 &&
48201 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48203 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48204 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48205 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48206
48207 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48208 LHS.getOperand(0).getValueType() == IntVT)) &&
48209 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48210 RHS.getOperand(0).getValueType() == IntVT))) {
48211 if (LHSIsConst)
48213 else
48214 LHS = LHS.getOperand(0);
48215
48216 if (RHSIsConst)
48218 else
48219 RHS = RHS.getOperand(0);
48220
48221 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48222 return DAG.getBitcast(VT, Select);
48223 }
48224 }
48225 }
48226
48227 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48228 // single bits, then invert the predicate and swap the select operands.
48229 // This can lower using a vector shift bit-hack rather than mask and compare.
48230 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48231 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48232 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48233 Cond.getOperand(0).getOpcode() == ISD::AND &&
48234 isNullOrNullSplat(Cond.getOperand(1)) &&
48235 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48236 Cond.getOperand(0).getValueType() == VT) {
48237 // The 'and' mask must be composed of power-of-2 constants.
48238 SDValue And = Cond.getOperand(0);
48239 auto *C = isConstOrConstSplat(And.getOperand(1));
48240 if (C && C->getAPIntValue().isPowerOf2()) {
48241 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48242 SDValue NotCond =
48243 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48244 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48245 }
48246
48247 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48248 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48249 // 16-bit lacks a proper blendv.
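// E.g. for an i32 lane whose mask constant is 4 (bit 2), shifting left by
// 31 - 2 == 29 moves that bit into the sign bit, so
//   (X & 4) == 0 ? LHS : RHS  -->  (X << 29) < 0 ? RHS : LHS
// and the blend can key directly off the sign bit.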
48250 unsigned EltBitWidth = VT.getScalarSizeInBits();
48251 bool CanShiftBlend =
48252 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48253 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48254 (Subtarget.hasXOP()));
48255 if (CanShiftBlend &&
48256 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48257 return C->getAPIntValue().isPowerOf2();
48258 })) {
48259 // Create a left-shift constant to get the mask bits over to the sign-bit.
48260 SDValue Mask = And.getOperand(1);
48261 SmallVector<int, 32> ShlVals;
48262 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48263 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48264 ShlVals.push_back(EltBitWidth - 1 -
48265 MaskVal->getAPIntValue().exactLogBase2());
48266 }
48267 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48268 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48269 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48270 SDValue NewCond =
48271 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48272 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48273 }
48274 }
48275
48276 return SDValue();
48277}
48278
48279/// Combine:
48280/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48281/// to:
48282/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48283/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48284/// Note that this is only legal for some op/cc combinations.
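/// For the pattern above, atomic_load_add returns the *old* value, so the
/// original test is "old < 0" (COND_S). The LOCKed ADD's flags describe
/// old + 1, and "old < 0" is equivalent to "old + 1 <= 0"; COND_LE is
/// computed from SF/OF/ZF, so wrap-around is accounted for, which is what
/// lets us drop the separate CMP.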
48286 SelectionDAG &DAG,
48287 const X86Subtarget &Subtarget) {
48288 // This combine only operates on CMP-like nodes.
48289 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48290 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48291 return SDValue();
48292
48293 // Can't replace the cmp if it has more uses than the one we're looking at.
48294 // FIXME: We would like to be able to handle this, but would need to make sure
48295 // all uses were updated.
48296 if (!Cmp.hasOneUse())
48297 return SDValue();
48298
48299 // This only applies to variations of the common case:
48300 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48301 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48302 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48303 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48304 // Using the proper condcodes (see below), overflow is checked for.
48305
48306 // FIXME: We can generalize both constraints:
48307 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48308 // - LHS != 1
48309 // if the result is compared.
48310
48311 SDValue CmpLHS = Cmp.getOperand(0);
48312 SDValue CmpRHS = Cmp.getOperand(1);
48313 EVT CmpVT = CmpLHS.getValueType();
48314
48315 if (!CmpLHS.hasOneUse())
48316 return SDValue();
48317
48318 unsigned Opc = CmpLHS.getOpcode();
48319 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48320 return SDValue();
48321
48322 SDValue OpRHS = CmpLHS.getOperand(2);
48323 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48324 if (!OpRHSC)
48325 return SDValue();
48326
48327 APInt Addend = OpRHSC->getAPIntValue();
48328 if (Opc == ISD::ATOMIC_LOAD_SUB)
48329 Addend = -Addend;
48330
48331 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48332 if (!CmpRHSC)
48333 return SDValue();
48334
48335 APInt Comparison = CmpRHSC->getAPIntValue();
48336 APInt NegAddend = -Addend;
48337
48338 // See if we can adjust the CC to make the comparison match the negated
48339 // addend.
48340 if (Comparison != NegAddend) {
48341 APInt IncComparison = Comparison + 1;
48342 if (IncComparison == NegAddend) {
48343 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48344 Comparison = IncComparison;
48345 CC = X86::COND_AE;
48346 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48347 Comparison = IncComparison;
48348 CC = X86::COND_L;
48349 }
48350 }
48351 APInt DecComparison = Comparison - 1;
48352 if (DecComparison == NegAddend) {
48353 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48354 Comparison = DecComparison;
48355 CC = X86::COND_A;
48356 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48357 Comparison = DecComparison;
48358 CC = X86::COND_LE;
48359 }
48360 }
48361 }
48362
48363 // If the addend is the negation of the comparison value, then we can do
48364 // a full comparison by emitting the atomic arithmetic as a locked sub.
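// E.g. for "(atomicrmw add p, 4) == -4": a LOCKed SUB of -4 updates memory by
// the same +4, but its EFLAGS describe old - (-4), which is exactly what
// "CMP old, -4" would have computed, so the separate compare disappears.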
48365 if (Comparison == NegAddend) {
48366 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48367 // atomic sub.
48368 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48369 auto AtomicSub = DAG.getAtomic(
48370 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48371 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48372 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48373 AN->getMemOperand());
48374 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48375 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48376 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48377 return LockOp;
48378 }
48379
48380 // We can handle comparisons with zero in a number of cases by manipulating
48381 // the CC used.
48382 if (!Comparison.isZero())
48383 return SDValue();
48384
48385 if (CC == X86::COND_S && Addend == 1)
48386 CC = X86::COND_LE;
48387 else if (CC == X86::COND_NS && Addend == 1)
48388 CC = X86::COND_G;
48389 else if (CC == X86::COND_G && Addend == -1)
48390 CC = X86::COND_GE;
48391 else if (CC == X86::COND_LE && Addend == -1)
48392 CC = X86::COND_L;
48393 else
48394 return SDValue();
48395
48396 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48397 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48398 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48399 return LockOp;
48400}
48401
48402// Check whether we're just testing the signbit, and whether we can simplify
48403// this by tracking where the signbit came from.
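// E.g. an i32 "(X >>s C) < 0" only depends on the sign bit of X, so it can be
// checked with a mask test of X against 0x80000000 (COND_S becomes COND_NE);
// the SHL / SIGN_EXTEND_INREG handling below just moves the tested bit.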
48405 SelectionDAG &DAG) {
48406 if (CC != X86::COND_S && CC != X86::COND_NS)
48407 return SDValue();
48408
48409 if (!Cmp.hasOneUse())
48410 return SDValue();
48411
48412 SDValue Src;
48413 if (Cmp.getOpcode() == X86ISD::CMP) {
48414 // CMP(X,0) -> signbit test
48415 if (!isNullConstant(Cmp.getOperand(1)))
48416 return SDValue();
48417 Src = Cmp.getOperand(0);
48418 // Peek through a SRA node as we just need the signbit.
48419 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48420 // TODO: Use SimplifyDemandedBits instead of just SRA?
48421 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48422 return SDValue();
48423 Src = Src.getOperand(0);
48424 } else if (Cmp.getOpcode() == X86ISD::OR) {
48425 // OR(X,Y) -> see if only one operand contributes to the signbit.
48426 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48427 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48428 Src = Cmp.getOperand(1);
48429 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48430 Src = Cmp.getOperand(0);
48431 else
48432 return SDValue();
48433 } else {
48434 return SDValue();
48435 }
48436
48437 // Replace with a TEST on the MSB.
48438 SDLoc DL(Cmp);
48439 MVT SrcVT = Src.getSimpleValueType();
48440 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48441
48442 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48443 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48444 if (Src.getOpcode() == ISD::SHL) {
48445 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48446 Src = Src.getOperand(0);
48447 BitMask.lshrInPlace(*ShiftAmt);
48448 }
48449 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48450 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48451 Src = Src.getOperand(0);
48452 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48453 }
48454
48455 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48456 DAG.getConstant(BitMask, DL, SrcVT));
48457 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48458 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48459 DAG.getConstant(0, DL, SrcVT));
48460}
48461
48462// Check whether a boolean test is testing a boolean value generated by
48463// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48464// code.
48465//
48466// Simplify the following patterns:
48467// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48468// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48469// to (Op EFLAGS Cond)
48470//
48471// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48472// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48473// to (Op EFLAGS !Cond)
48474//
48475// where Op could be BRCOND or CMOV.
48476//
48478 // This combine only operates on CMP-like nodes.
48479 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48480 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48481 return SDValue();
48482
48483 // Quit if not used as a boolean value.
48484 if (CC != X86::COND_E && CC != X86::COND_NE)
48485 return SDValue();
48486
48487 // Check CMP operands. One of them should be 0 or 1 and the other should be
48488 // a SetCC or extended from it.
48489 SDValue Op1 = Cmp.getOperand(0);
48490 SDValue Op2 = Cmp.getOperand(1);
48491
48492 SDValue SetCC;
48493 const ConstantSDNode* C = nullptr;
48494 bool needOppositeCond = (CC == X86::COND_E);
48495 bool checkAgainstTrue = false; // Is it a comparison against 1?
48496
48497 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48498 SetCC = Op2;
48499 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48500 SetCC = Op1;
48501 else // Quit if neither operand is a constant.
48502 return SDValue();
48503
48504 if (C->getZExtValue() == 1) {
48505 needOppositeCond = !needOppositeCond;
48506 checkAgainstTrue = true;
48507 } else if (C->getZExtValue() != 0)
48508 // Quit if the constant is neither 0 nor 1.
48509 return SDValue();
48510
48511 bool truncatedToBoolWithAnd = false;
48512 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48513 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48514 SetCC.getOpcode() == ISD::TRUNCATE ||
48515 SetCC.getOpcode() == ISD::AND) {
48516 if (SetCC.getOpcode() == ISD::AND) {
48517 int OpIdx = -1;
48518 if (isOneConstant(SetCC.getOperand(0)))
48519 OpIdx = 1;
48520 if (isOneConstant(SetCC.getOperand(1)))
48521 OpIdx = 0;
48522 if (OpIdx < 0)
48523 break;
48524 SetCC = SetCC.getOperand(OpIdx);
48525 truncatedToBoolWithAnd = true;
48526 } else
48527 SetCC = SetCC.getOperand(0);
48528 }
48529
48530 switch (SetCC.getOpcode()) {
48532 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48533 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48534 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48535 // truncated to i1 using 'and'.
48536 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48537 break;
48539 "Invalid use of SETCC_CARRY!");
48540 [[fallthrough]];
48541 case X86ISD::SETCC:
48542 // Set the condition code or opposite one if necessary.
48543 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48544 if (needOppositeCond)
48546 return SetCC.getOperand(1);
48547 case X86ISD::CMOV: {
48548 // Check whether false/true value has canonical one, i.e. 0 or 1.
48551 // Quit if true value is not a constant.
48552 if (!TVal)
48553 return SDValue();
48554 // Quit if false value is not a constant.
48555 if (!FVal) {
48556 SDValue Op = SetCC.getOperand(0);
48557 // Skip 'zext' or 'trunc' node.
48558 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48559 Op.getOpcode() == ISD::TRUNCATE)
48560 Op = Op.getOperand(0);
48561 // A special case for rdrand/rdseed, where 0 is set if false cond is
48562 // found.
48563 if ((Op.getOpcode() != X86ISD::RDRAND &&
48564 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48565 return SDValue();
48566 }
48567 // Quit if false value is not the constant 0 or 1.
48568 bool FValIsFalse = true;
48569 if (FVal && FVal->getZExtValue() != 0) {
48570 if (FVal->getZExtValue() != 1)
48571 return SDValue();
48572 // If FVal is 1, opposite cond is needed.
48573 needOppositeCond = !needOppositeCond;
48574 FValIsFalse = false;
48575 }
48576 // Quit if TVal is not the constant opposite of FVal.
48577 if (FValIsFalse && TVal->getZExtValue() != 1)
48578 return SDValue();
48579 if (!FValIsFalse && TVal->getZExtValue() != 0)
48580 return SDValue();
48581 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48582 if (needOppositeCond)
48584 return SetCC.getOperand(3);
48585 }
48586 }
48587
48588 return SDValue();
48589}
48590
48591/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48592/// Match:
48593/// (X86or (X86setcc) (X86setcc))
48594/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48596 X86::CondCode &CC1, SDValue &Flags,
48597 bool &isAnd) {
48598 if (Cond->getOpcode() == X86ISD::CMP) {
48599 if (!isNullConstant(Cond->getOperand(1)))
48600 return false;
48601
48602 Cond = Cond->getOperand(0);
48603 }
48604
48605 isAnd = false;
48606
48607 SDValue SetCC0, SetCC1;
48608 switch (Cond->getOpcode()) {
48609 default: return false;
48610 case ISD::AND:
48611 case X86ISD::AND:
48612 isAnd = true;
48613 [[fallthrough]];
48614 case ISD::OR:
48615 case X86ISD::OR:
48616 SetCC0 = Cond->getOperand(0);
48617 SetCC1 = Cond->getOperand(1);
48618 break;
48619 };
48620
48621 // Make sure we have SETCC nodes, using the same flags value.
48622 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48623 SetCC1.getOpcode() != X86ISD::SETCC ||
48624 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48625 return false;
48626
48627 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48628 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48629 Flags = SetCC0->getOperand(1);
48630 return true;
48631}
48632
48633// When legalizing carry, we create carries via "add X, -1".
48634// If that comes from an actual carry, via setcc, we use the
48635// carry directly.
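// This works because for a boolean X (0 or 1), "add X, -1" sets CF to X:
//   X == 1: 1 + 0xFF..FF wraps to 0,      CF = 1
//   X == 0: 0 + 0xFF..FF == 0xFF..FF,     CF = 0
// so a COND_B consumer can read the original carry straight off X.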
48637 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48638 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48639 bool FoundAndLSB = false;
48640 SDValue Carry = EFLAGS.getOperand(0);
48641 while (Carry.getOpcode() == ISD::TRUNCATE ||
48642 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48643 (Carry.getOpcode() == ISD::AND &&
48644 isOneConstant(Carry.getOperand(1)))) {
48645 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48646 Carry = Carry.getOperand(0);
48647 }
48648 if (Carry.getOpcode() == X86ISD::SETCC ||
48649 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48650 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48651 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48652 SDValue CarryOp1 = Carry.getOperand(1);
48653 if (CarryCC == X86::COND_B)
48654 return CarryOp1;
48655 if (CarryCC == X86::COND_A) {
48656 // Try to convert COND_A into COND_B in an attempt to facilitate
48657 // materializing "setb reg".
48658 //
48659 // Do not flip "e > c", where "c" is a constant, because the Cmp
48660 // instruction cannot take an immediate as its first operand.
48661 //
48662 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48663 CarryOp1.getNode()->hasOneUse() &&
48664 CarryOp1.getValueType().isInteger() &&
48665 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48666 SDValue SubCommute =
48667 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48668 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48669 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48670 }
48671 }
48672 // If this is a check of the z flag of an add with 1, switch to the
48673 // C flag.
48674 if (CarryCC == X86::COND_E &&
48675 CarryOp1.getOpcode() == X86ISD::ADD &&
48676 isOneConstant(CarryOp1.getOperand(1)))
48677 return CarryOp1;
48678 } else if (FoundAndLSB) {
48679 SDLoc DL(Carry);
48680 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48681 if (Carry.getOpcode() == ISD::SRL) {
48682 BitNo = Carry.getOperand(1);
48683 Carry = Carry.getOperand(0);
48684 }
48685 return getBT(Carry, BitNo, DL, DAG);
48686 }
48687 }
48688 }
48689
48690 return SDValue();
48691}
48692
48693/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
48694/// to avoid the inversion.
48696 SelectionDAG &DAG,
48697 const X86Subtarget &Subtarget) {
48698 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48699 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48700 EFLAGS.getOpcode() != X86ISD::TESTP)
48701 return SDValue();
48702
48703 // PTEST/TESTP sets EFLAGS as:
48704 // TESTZ: ZF = (Op0 & Op1) == 0
48705 // TESTC: CF = (~Op0 & Op1) == 0
48706 // TESTNZC: ZF == 0 && CF == 0
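// From those definitions, inverting the first operand swaps the two flags:
//   ZF(~X, Y) == ((~X & Y) == 0) == CF(X, Y)
//   CF(~X, Y) == (( X & Y) == 0) == ZF(X, Y)
// which is why the switch below trades testz conditions for testc ones (and
// vice versa) instead of materializing the NOT.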
48707 MVT VT = EFLAGS.getSimpleValueType();
48708 SDValue Op0 = EFLAGS.getOperand(0);
48709 SDValue Op1 = EFLAGS.getOperand(1);
48710 MVT OpVT = Op0.getSimpleValueType();
48711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48712
48713 // TEST*(~X,Y) == TEST*(X,Y)
48714 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48715 X86::CondCode InvCC;
48716 switch (CC) {
48717 case X86::COND_B:
48718 // testc -> testz.
48719 InvCC = X86::COND_E;
48720 break;
48721 case X86::COND_AE:
48722 // !testc -> !testz.
48723 InvCC = X86::COND_NE;
48724 break;
48725 case X86::COND_E:
48726 // testz -> testc.
48727 InvCC = X86::COND_B;
48728 break;
48729 case X86::COND_NE:
48730 // !testz -> !testc.
48731 InvCC = X86::COND_AE;
48732 break;
48733 case X86::COND_A:
48734 case X86::COND_BE:
48735 // testnzc -> testnzc (no change).
48736 InvCC = CC;
48737 break;
48738 default:
48739 InvCC = X86::COND_INVALID;
48740 break;
48741 }
48742
48743 if (InvCC != X86::COND_INVALID) {
48744 CC = InvCC;
48745 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48746 DAG.getBitcast(OpVT, NotOp0), Op1);
48747 }
48748 }
48749
48750 if (CC == X86::COND_B || CC == X86::COND_AE) {
48751 // TESTC(X,~X) == TESTC(X,-1)
48752 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48753 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48754 SDLoc DL(EFLAGS);
48755 return DAG.getNode(
48756 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48757 DAG.getBitcast(OpVT,
48758 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48759 }
48760 }
48761 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48762 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48764 SDValue BC0 = peekThroughBitcasts(Op0);
48765 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48767 SDLoc DL(EFLAGS);
48768 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48769 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48770 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48771 }
48772 }
48773 }
48774
48775 if (CC == X86::COND_E || CC == X86::COND_NE) {
48776 // TESTZ(X,~Y) == TESTC(Y,X)
48777 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48778 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48779 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48780 DAG.getBitcast(OpVT, NotOp1), Op0);
48781 }
48782
48783 if (Op0 == Op1) {
48784 SDValue BC = peekThroughBitcasts(Op0);
48785 EVT BCVT = BC.getValueType();
48786
48787 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48788 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48789 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48790 DAG.getBitcast(OpVT, BC.getOperand(0)),
48791 DAG.getBitcast(OpVT, BC.getOperand(1)));
48792 }
48793
48794 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48795 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48796 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48797 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48798 DAG.getBitcast(OpVT, BC.getOperand(0)),
48799 DAG.getBitcast(OpVT, BC.getOperand(1)));
48800 }
48801
48802 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48803 // to more efficiently extract the sign bits and compare that.
48804 // TODO: Handle TESTC with comparison inversion.
48805 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48806 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48807 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48808 unsigned EltBits = BCVT.getScalarSizeInBits();
48809 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48810 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48811 APInt SignMask = APInt::getSignMask(EltBits);
48812 if (SDValue Res =
48813 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48814 // For vXi16 cases we need to use pmovmskb and extract every other
48815 // sign bit.
48816 SDLoc DL(EFLAGS);
48817 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48818 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48819 MVT FloatVT =
48820 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48821 Res = DAG.getBitcast(FloatVT, Res);
48822 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48823 } else if (EltBits == 16) {
48824 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48825 Res = DAG.getBitcast(MovmskVT, Res);
48826 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48827 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48828 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48829 } else {
48830 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48831 }
48832 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48833 DAG.getConstant(0, DL, MVT::i32));
48834 }
48835 }
48836 }
48837 }
48838
48839 // TESTZ(-1,X) == TESTZ(X,X)
48841 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48842
48843 // TESTZ(X,-1) == TESTZ(X,X)
48845 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48846
48847 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48848 // TODO: Add COND_NE handling?
48849 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48850 SDValue Src0 = peekThroughBitcasts(Op0);
48851 SDValue Src1 = peekThroughBitcasts(Op1);
48852 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48854 peekThroughBitcasts(Src0.getOperand(1)), true);
48856 peekThroughBitcasts(Src1.getOperand(1)), true);
48857 if (Src0 && Src1) {
48858 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48859 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48860 DAG.getBitcast(OpVT2, Src0),
48861 DAG.getBitcast(OpVT2, Src1));
48862 }
48863 }
48864 }
48865 }
48866
48867 return SDValue();
48868}
48869
48870// Attempt to simplify the MOVMSK input based on the comparison type.
48872 SelectionDAG &DAG,
48873 const X86Subtarget &Subtarget) {
48874 // Handle eq/ne against zero (any_of).
48875 // Handle eq/ne against -1 (all_of).
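// MOVMSK packs one sign bit per element, so for an N-element source:
//   movmsk(V) == 0            <=>  no element has its sign bit set (any_of)
//   movmsk(V) == (1 << N) - 1 <=>  every element has its sign bit set (all_of)
// and the rewrites below only need to preserve those two predicates.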
48876 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48877 return SDValue();
48878 if (EFLAGS.getValueType() != MVT::i32)
48879 return SDValue();
48880 unsigned CmpOpcode = EFLAGS.getOpcode();
48881 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48882 return SDValue();
48883 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48884 if (!CmpConstant)
48885 return SDValue();
48886 const APInt &CmpVal = CmpConstant->getAPIntValue();
48887
48888 SDValue CmpOp = EFLAGS.getOperand(0);
48889 unsigned CmpBits = CmpOp.getValueSizeInBits();
48890 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48891
48892 // Peek through any truncate.
48893 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48894 CmpOp = CmpOp.getOperand(0);
48895
48896 // Bail if we don't find a MOVMSK.
48897 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48898 return SDValue();
48899
48900 SDValue Vec = CmpOp.getOperand(0);
48901 MVT VecVT = Vec.getSimpleValueType();
48902 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48903 "Unexpected MOVMSK operand");
48904 unsigned NumElts = VecVT.getVectorNumElements();
48905 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48906
48907 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48908 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48909 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48910 if (!IsAnyOf && !IsAllOf)
48911 return SDValue();
48912
48913 // TODO: Check more cases for this combine.
48914 // We use the number of uses of the CMP to decide whether to combine or not.
48915 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines
48916 // are known (from the existing tests) to want this one-use constraint.
48917 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48918
48919 // See if we can peek through to a vector with a wider element type, if the
48920 // signbits extend down to all the sub-elements as well.
48921 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48922 // potential SimplifyDemandedBits/Elts cases.
48923 // If we looked through a truncate that discards bits, we can't do this
48924 // transform.
48925 // FIXME: We could do this transform for truncates that discarded bits by
48926 // inserting an AND mask between the new MOVMSK and the CMP.
48927 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48928 SDValue BC = peekThroughBitcasts(Vec);
48929 MVT BCVT = BC.getSimpleValueType();
48930 unsigned BCNumElts = BCVT.getVectorNumElements();
48931 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48932 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48933 BCNumEltBits > NumEltBits &&
48934 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48935 SDLoc DL(EFLAGS);
48936 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48937 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48938 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48939 DAG.getConstant(CmpMask, DL, MVT::i32));
48940 }
48941 }
48942
48943 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48944 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48945 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48946 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48947 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48949 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48950 Ops.size() == 2) {
48951 SDLoc DL(EFLAGS);
48952 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48953 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48954 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48955 DAG.getBitcast(SubVT, Ops[0]),
48956 DAG.getBitcast(SubVT, Ops[1]));
48957 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48958 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48959 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48960 DAG.getConstant(CmpMask, DL, MVT::i32));
48961 }
48962 }
48963
48964 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48965 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48966 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48967 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48968 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48969 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48970 SDValue BC = peekThroughBitcasts(Vec);
48971 // Ensure MOVMSK was testing every signbit of BC.
48972 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48973 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48974 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48975 BC.getOperand(0), BC.getOperand(1));
48976 V = DAG.getBitcast(TestVT, V);
48977 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48978 }
48979 // Check for 256-bit split vector cases.
48980 if (BC.getOpcode() == ISD::AND &&
48981 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48982 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48983 SDValue LHS = BC.getOperand(0);
48984 SDValue RHS = BC.getOperand(1);
48985 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48986 LHS.getOperand(0), LHS.getOperand(1));
48987 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48988 RHS.getOperand(0), RHS.getOperand(1));
48989 LHS = DAG.getBitcast(TestVT, LHS);
48990 RHS = DAG.getBitcast(TestVT, RHS);
48991 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48992 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48993 }
48994 }
48995 }
48996
48997 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48998 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48999 // sign bits prior to the comparison with zero unless we know that
49000 // the vXi16 splats the sign bit down to the lower i8 half.
49001 // TODO: Handle all_of patterns.
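// PMOVMSKB on the unpacked vXi16 source (viewed as v2Xi8) reads two bits per
// i16 element, and only the odd bits are the i16 sign bits that PACKSS would
// have kept; hence the 0xAAAA / 0xAAAAAAAA masks below whenever the low
// byte's sign bit is not known to agree with the high byte's.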
49002 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49003 SDValue VecOp0 = Vec.getOperand(0);
49004 SDValue VecOp1 = Vec.getOperand(1);
49005 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49006 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49007 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49008 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49009 SDLoc DL(EFLAGS);
49010 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49011 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49012 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49013 if (!SignExt0) {
49014 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49015 DAG.getConstant(0xAAAA, DL, MVT::i16));
49016 }
49017 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49018 DAG.getConstant(0, DL, MVT::i16));
49019 }
49020 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49021 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49022 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49023 (IsAnyOf || (SignExt0 && SignExt1))) {
49024 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49025 SDLoc DL(EFLAGS);
49026 SDValue Result = peekThroughBitcasts(Src);
49027 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49028 Result.getValueType().getVectorNumElements() <= NumElts) {
49029 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49030 Result.getOperand(0), Result.getOperand(1));
49031 V = DAG.getBitcast(MVT::v4i64, V);
49032 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49033 }
49034 Result = DAG.getBitcast(MVT::v32i8, Result);
49035 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49036 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49037 if (!SignExt0 || !SignExt1) {
49038 assert(IsAnyOf &&
49039 "Only perform v16i16 signmasks for any_of patterns");
49040 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49041 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49042 }
49043 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49044 DAG.getConstant(CmpMask, DL, MVT::i32));
49045 }
49046 }
49047 }
49048
49049 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49050 // Since we peek through a bitcast, we need to be careful if the base vector
49051 // type has smaller elements than the MOVMSK type. In that case, even if
49052 // all the elements are demanded by the shuffle mask, only the "high"
49053 // elements which have highbits that align with highbits in the MOVMSK vec
49054 // elements are actually demanded. A simplification of spurious operations
49055 // on the "low" elements takes place during other simplifications.
49056 //
49057 // For example:
49058 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49059 // demanded, the result can change because the elements are swapped around.
49060 //
49061 // To address this, we check that we can scale the shuffle mask to MOVMSK
49062 // element width (this will ensure "high" elements match). It's slightly overly
49063 // conservative, but fine for an edge case fold.
49064 SmallVector<int, 32> ShuffleMask;
49065 SmallVector<SDValue, 2> ShuffleInputs;
49066 if (NumElts <= CmpBits &&
49067 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49068 ShuffleMask, DAG) &&
49069 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49070 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49071 canScaleShuffleElements(ShuffleMask, NumElts)) {
49072 SDLoc DL(EFLAGS);
49073 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49074 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49075 Result =
49076 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49077 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49078 }
49079
49080 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49081 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49082 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49083 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49084 // iff every element is referenced.
49085 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49086 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49087 (NumEltBits == 32 || NumEltBits == 64)) {
49088 SDLoc DL(EFLAGS);
49089 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49090 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49091 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49092 SDValue LHS = Vec;
49093 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49094 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49095 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49096 DAG.getBitcast(FloatVT, LHS),
49097 DAG.getBitcast(FloatVT, RHS));
49098 }
49099
49100 return SDValue();
49101}
49102
49103/// Optimize an EFLAGS definition used according to the condition code \p CC
49104/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49105/// uses of chain values.
49107 SelectionDAG &DAG,
49108 const X86Subtarget &Subtarget) {
49109 if (CC == X86::COND_B)
49110 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49111 return Flags;
49112
49113 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49114 return R;
49115
49116 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49117 return R;
49118
49119 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49120 return R;
49121
49122 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49123 return R;
49124
49125 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49126}
49127
49128/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49131 const X86Subtarget &Subtarget) {
49132 SDLoc DL(N);
49133 EVT VT = N->getValueType(0);
49134 SDValue FalseOp = N->getOperand(0);
49135 SDValue TrueOp = N->getOperand(1);
49136 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49137 SDValue Cond = N->getOperand(3);
49138
49139 // cmov X, X, ?, ? --> X
49140 if (TrueOp == FalseOp)
49141 return TrueOp;
49142
49143 // Try to simplify the EFLAGS and condition code operands.
49144 // We can't always do this as FCMOV only supports a subset of X86 cond.
49145 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49146 if (!(FalseOp.getValueType() == MVT::f80 ||
49147 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49148 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49149 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49150 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49151 Flags};
49152 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49153 }
49154 }
49155
49156 // If this is a select between two integer constants, try to do some
49157 // optimizations. Note that the operands are ordered the opposite of SELECT
49158 // operands.
49159 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49160 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49161 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49162 // larger than FalseC (the false value).
49163 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49164 CC = X86::GetOppositeBranchCondition(CC);
49165 std::swap(TrueC, FalseC);
49166 std::swap(TrueOp, FalseOp);
49167 }
49168
49169 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49170 // This is efficient for any integer data type (including i8/i16) and
49171 // shift amount.
49172 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49173 Cond = getSETCC(CC, Cond, DL, DAG);
49174
49175 // Zero extend the condition if needed.
49176 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49177
49178 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49179 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49180 DAG.getConstant(ShAmt, DL, MVT::i8));
49181 return Cond;
49182 }
49183
49184 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49185 // for any integer data type, including i8/i16.
49186 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49187 Cond = getSETCC(CC, Cond, DL, DAG);
49188
49189 // Zero extend the condition if needed.
49190 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49191 FalseC->getValueType(0), Cond);
49192 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49193 SDValue(FalseC, 0));
49194 return Cond;
49195 }
49196
49197 // Optimize cases that will turn into an LEA instruction. This requires
49198 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49199 if (VT == MVT::i32 || VT == MVT::i64) {
49200 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49201 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49202 "Implicit constant truncation");
49203
49204 bool isFastMultiplier = false;
49205 if (Diff.ult(10)) {
49206 switch (Diff.getZExtValue()) {
49207 default: break;
49208 case 1: // result = add base, cond
49209 case 2: // result = lea base( , cond*2)
49210 case 3: // result = lea base(cond, cond*2)
49211 case 4: // result = lea base( , cond*4)
49212 case 5: // result = lea base(cond, cond*4)
49213 case 8: // result = lea base( , cond*8)
49214 case 9: // result = lea base(cond, cond*8)
49215 isFastMultiplier = true;
49216 break;
49217 }
49218 }
49219
49220 if (isFastMultiplier) {
49221 Cond = getSETCC(CC, Cond, DL ,DAG);
49222 // Zero extend the condition if needed.
49223 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49224 Cond);
49225 // Scale the condition by the difference.
49226 if (Diff != 1)
49227 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49228 DAG.getConstant(Diff, DL, Cond.getValueType()));
49229
49230 // Add the base if non-zero.
49231 if (FalseC->getAPIntValue() != 0)
49232 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49233 SDValue(FalseC, 0));
49234 return Cond;
49235 }
49236 }
49237 }
49238 }
49239
49240 // Handle these cases:
49241 // (select (x != c), e, c) -> (select (x != c), e, x),
49242 // (select (x == c), c, e) -> (select (x == c), x, e)
49243 // where the c is an integer constant, and the "select" is the combination
49244 // of CMOV and CMP.
49245 //
49246 // The rationale for this change is that the conditional-move from a constant
49247 // needs two instructions, however, conditional-move from a register needs
49248 // only one instruction.
49249 //
49250 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49251 // some instruction-combining opportunities. This opt needs to be
49252 // postponed as late as possible.
49253 //
49254 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49255 // the DCI.xxxx conditions are provided to postpone the optimization as
49256 // late as possible.
49257
49258 ConstantSDNode *CmpAgainst = nullptr;
49259 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49260 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49261 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49262
49263 if (CC == X86::COND_NE &&
49264 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49265 CC = X86::COND_E;
49266 std::swap(TrueOp, FalseOp);
49267 }
49268
49269 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49270 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49271 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49272 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49273 }
49274 }
49275 }
49276
49277 // Transform:
49278 //
49279 // (cmov 1 T (uge T 2))
49280 //
49281 // to:
49282 //
49283 // (adc T 0 (sub T 1))
49284 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49285 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49286 SDValue Cond0 = Cond.getOperand(0);
49287 if (Cond0.getOpcode() == ISD::TRUNCATE)
49288 Cond0 = Cond0.getOperand(0);
49289 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49290 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49291 EVT CondVT = Cond->getValueType(0);
49292 // Subtract 1 and generate a carry.
49293 SDValue NewSub =
49294 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49295 DAG.getConstant(1, DL, CondVT));
49296 SDValue EFLAGS(NewSub.getNode(), 1);
49297 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49298 DAG.getConstant(0, DL, VT), EFLAGS);
49299 }
49300 }
49301
49302 // Fold and/or of setcc's to double CMOV:
49303 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49304 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49305 //
49306 // This combine lets us generate:
49307 // cmovcc1 (jcc1 if we don't have CMOV)
49308 // cmovcc2 (same)
49309 // instead of:
49310 // setcc1
49311 // setcc2
49312 // and/or
49313 // cmovne (jne if we don't have CMOV)
49314 // When we can't use the CMOV instruction, it might increase branch
49315 // mispredicts.
49316 // When we can use CMOV, or when there is no mispredict, this improves
49317 // throughput and reduces register pressure.
49318 //
49319 if (CC == X86::COND_NE) {
49320 SDValue Flags;
49321 X86::CondCode CC0, CC1;
49322 bool isAndSetCC;
49323 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49324 if (isAndSetCC) {
49325 std::swap(FalseOp, TrueOp);
49326 CC0 = X86::GetOppositeBranchCondition(CC0);
49327 CC1 = X86::GetOppositeBranchCondition(CC1);
49328 }
49329
49330 SDValue LOps[] = {FalseOp, TrueOp,
49331 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49332 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49333 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49334 Flags};
49335 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49336 return CMOV;
49337 }
49338 }
49339
49340 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49341 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49342 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49343 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49344 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49345 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
49346 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49347 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49348 SDValue Add = TrueOp;
49349 SDValue Const = FalseOp;
49350 // Canonicalize the condition code for easier matching and output.
49351 if (CC == X86::COND_E)
49352 std::swap(Add, Const);
49353
49354 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49355 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49356 Add.getResNo() == 0 && Add.hasOneUse() &&
49357 Add.getOperand(1) == Cond.getOperand(0)) {
49358 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49359 Add.getOperand(1));
49360 }
49361
49362 // We might have replaced the constant in the cmov with the LHS of the
49363 // compare. If so change it to the RHS of the compare.
49364 if (Const == Cond.getOperand(0))
49365 Const = Cond.getOperand(1);
49366
49367 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49368 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49369 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49370 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49371 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49372 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49373 // This should constant fold.
49374 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49375 SDValue CMov =
49376 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49377 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49378 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49379 }
49380 }
49381
49382 return SDValue();
49383}
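// Sanity check of the (cmov 1, T, uge(T, 2)) -> (adc T, 0, sub(T, 1)) fold
// above, treating T as unsigned:
//   T == 0: sub(0, 1) borrows, carry = 1, adc yields 0 + 0 + 1 = 1
//   T == 1: sub(1, 1) does not borrow, carry = 0, adc yields 1 + 0 + 0 = 1
//   T >= 2: no borrow, carry = 0, adc yields T
// which matches the original cmov (1 when T < 2, otherwise T).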
49384
49385/// Different mul shrinking modes.
49386enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49387
49388static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49389 EVT VT = N->getOperand(0).getValueType();
49390 if (VT.getScalarSizeInBits() != 32)
49391 return false;
49392
49393 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49394 unsigned SignBits[2] = {1, 1};
49395 bool IsPositive[2] = {false, false};
49396 for (unsigned i = 0; i < 2; i++) {
49397 SDValue Opd = N->getOperand(i);
49398
49399 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49400 IsPositive[i] = DAG.SignBitIsZero(Opd);
49401 }
49402
49403 bool AllPositive = IsPositive[0] && IsPositive[1];
49404 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49405 // When ranges are from -128 ~ 127, use MULS8 mode.
49406 if (MinSignBits >= 25)
49407 Mode = ShrinkMode::MULS8;
49408 // When ranges are from 0 ~ 255, use MULU8 mode.
49409 else if (AllPositive && MinSignBits >= 24)
49410 Mode = ShrinkMode::MULU8;
49411 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49412 else if (MinSignBits >= 17)
49413 Mode = ShrinkMode::MULS16;
49414 // When ranges are from 0 ~ 65535, use MULU16 mode.
49415 else if (AllPositive && MinSignBits >= 16)
49416 Mode = ShrinkMode::MULU16;
49417 else
49418 return false;
49419 return true;
49420}
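// The thresholds above follow from ComputeNumSignBits on an i32 lane: a value
// that fits in a signed i8 has at least 32 - 8 + 1 = 25 redundant sign bits,
// an unsigned i8 value has at least 24 leading zero/sign bits, a signed i16
// value at least 17, and an unsigned i16 value at least 16.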
49421
49422/// When the operands of vector mul are extended from smaller size values,
49423/// like i8 and i16, the type of mul may be shrunk to generate more
49424/// efficient code. Two typical patterns are handled:
49425/// Pattern1:
49426/// %2 = sext/zext <N x i8> %1 to <N x i32>
49427/// %4 = sext/zext <N x i8> %3 to <N x i32>
49428/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49429/// %5 = mul <N x i32> %2, %4
49430///
49431/// Pattern2:
49432/// %2 = zext/sext <N x i16> %1 to <N x i32>
49433/// %4 = zext/sext <N x i16> %3 to <N x i32>
49434/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49435/// %5 = mul <N x i32> %2, %4
49436///
49437/// There are four mul shrinking modes:
49438/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49439/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49440/// generate pmullw+sext32 for it (MULS8 mode).
49441/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49442/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49443/// generate pmullw+zext32 for it (MULU8 mode).
49444/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49445/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49446/// generate pmullw+pmulhw for it (MULS16 mode).
49447/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49448/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49449/// generate pmullw+pmulhuw for it (MULU16 mode).
49450static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49451 const X86Subtarget &Subtarget) {
49452 // Check for legality
49454 // pmullw/pmulhw are not available without SSE2.
49454 if (!Subtarget.hasSSE2())
49455 return SDValue();
49456
49457 // Check for profitability
49458 // pmulld is supported since SSE41. It is better to use pmulld
49459 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49460 // the expansion.
49461 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49462 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49463 return SDValue();
49464
49466 if (!canReduceVMulWidth(N, DAG, Mode))
49467 return SDValue();
49468
49469 SDValue N0 = N->getOperand(0);
49470 SDValue N1 = N->getOperand(1);
49471 EVT VT = N->getOperand(0).getValueType();
49472 unsigned NumElts = VT.getVectorNumElements();
49473 if ((NumElts % 2) != 0)
49474 return SDValue();
49475
49476 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49477
49478 // Shrink the operands of mul.
49479 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49480 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49481
49482 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49483 // lower part is needed.
49484 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49485 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49486 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49487 : ISD::SIGN_EXTEND,
49488 DL, VT, MulLo);
49489
49490 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49491 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49492 // the higher part is also needed.
49493 SDValue MulHi =
49494 DAG.getNode((Mode == ShrinkMode::MULS16) ? ISD::MULHS : ISD::MULHU, DL,
49495 ReducedVT, NewN0, NewN1);
49496
49497 // Repack the lower part and higher part result of mul into a wider
49498 // result.
49499 // Generate shuffle functioning as punpcklwd.
49500 SmallVector<int, 16> ShuffleMask(NumElts);
49501 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49502 ShuffleMask[2 * i] = i;
49503 ShuffleMask[2 * i + 1] = i + NumElts;
49504 }
49505 SDValue ResLo =
49506 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49507 ResLo = DAG.getBitcast(ResVT, ResLo);
49508 // Generate shuffle functioning as punpckhwd.
49509 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49510 ShuffleMask[2 * i] = i + NumElts / 2;
49511 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49512 }
49513 SDValue ResHi =
49514 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49515 ResHi = DAG.getBitcast(ResVT, ResHi);
49516 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49517}
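// Rough shape of the MULS16/MULU16 path above for a hypothetical v4i32
// multiply whose operands fit in 16 bits:
//   lo = pmullw(a16, b16)           ; low 16 bits of each 32-bit product
//   hi = pmulhw / pmulhuw(a16, b16) ; high 16 bits of each 32-bit product
//   punpcklwd / punpckhwd(lo, hi)   ; interleave to rebuild the i32 lanes
// replacing a 32-bit pmulld with two 16-bit multiplies and two unpacks.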
49518
49519static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49520 EVT VT, const SDLoc &DL) {
49521
49522 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49523 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49524 DAG.getConstant(Mult, DL, VT));
49525 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49526 DAG.getConstant(Shift, DL, MVT::i8));
49527 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49528 N->getOperand(0));
49529 return Result;
49530 };
49531
49532 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49533 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49534 DAG.getConstant(Mul1, DL, VT));
49535 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49536 DAG.getConstant(Mul2, DL, VT));
49537 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49538 N->getOperand(0));
49539 return Result;
49540 };
49541
49542 switch (MulAmt) {
49543 default:
49544 break;
49545 case 11:
49546 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49547 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49548 case 21:
49549 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49550 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49551 case 41:
49552 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49553 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49554 case 22:
49555 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49556 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49557 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49558 case 19:
49559 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49560 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49561 case 37:
49562 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49563 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49564 case 73:
49565 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49566 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49567 case 13:
49568 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49569 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49570 case 23:
49571 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49572 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49573 case 26:
49574 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49575 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49576 case 28:
49577 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49578 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49579 case 29:
49580 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49581 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49582 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49583 }
49584
49585 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49586 // followed by a single LEA.
49587 // First check if this is a sum of two powers of 2 because that's easy. Then
49588 // count the trailing zeros to find the smaller power of 2.
49589 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49590 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49591 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49592 if (ScaleShift >= 1 && ScaleShift < 4) {
49593 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49594 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49595 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49596 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49597 DAG.getConstant(ScaleShift, DL, MVT::i8));
49598 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49599 }
49600 }
49601
49602 return SDValue();
49603}
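// Quick arithmetic check of a few decompositions above (illustrative only):
//   mul x, 11: ((x * 5) << 1) + x = 10x + x = 11x
//   mul x, 23: ((x * 3) << 3) - x = 24x - x = 23x
//   mul x, 26: ((x * 5) * 5) + x = 25x + x = 26x
// Each x*3 / x*5 / x*9 factor maps to a single LEA with scale 2, 4 or 8.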
49604
49605// If the upper 17 bits of one element are zero and the upper bits of the other
49606// element are all zero/sign bits, then we can use PMADDWD, which is always at
49607// least as quick as PMULLD, except on KNL.
49608static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49609 SelectionDAG &DAG,
49610 const X86Subtarget &Subtarget) {
49611 if (!Subtarget.hasSSE2())
49612 return SDValue();
49613
49614 if (Subtarget.isPMADDWDSlow())
49615 return SDValue();
49616
49617 EVT VT = N->getValueType(0);
49618
49619 // Only support vXi32 vectors.
49620 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49621 return SDValue();
49622
49623 // Make sure the type is legal or can split/widen to a legal type.
49624 // With AVX512 but without BWI, we would need to split v32i16.
49625 unsigned NumElts = VT.getVectorNumElements();
49626 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49627 return SDValue();
49628
49629 // With AVX512 but without BWI, we would need to split v32i16.
49630 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49631 return SDValue();
49632
49633 SDValue N0 = N->getOperand(0);
49634 SDValue N1 = N->getOperand(1);
49635
49636 // If we are zero/sign extending two steps without SSE4.1, it's better to
49637 // reduce the vmul width instead.
49638 if (!Subtarget.hasSSE41() &&
49639 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49640 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49641 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49642 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49643 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49644 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49645 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49646 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49647 return SDValue();
49648
49649 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49650 // the vmul width instead.
49651 if (!Subtarget.hasSSE41() &&
49652 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49653 N0.getOperand(0).getValueSizeInBits() > 128) &&
49654 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49655 N1.getOperand(0).getValueSizeInBits() > 128))
49656 return SDValue();
49657
49658 // Sign bits must extend down to the lowest i16.
49659 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49660 DAG.ComputeMaxSignificantBits(N0) > 16)
49661 return SDValue();
49662
49663 // At least one of the elements must be zero in the upper 17 bits, or can be
49664 // safely made zero without altering the final result.
49665 auto GetZeroableOp = [&](SDValue Op) {
49666 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49667 if (DAG.MaskedValueIsZero(Op, Mask17))
49668 return Op;
49669 // Mask off upper 16-bits of sign-extended constants.
49670 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49671 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49672 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49673 SDValue Src = Op.getOperand(0);
49674 // Convert sext(vXi16) to zext(vXi16).
49675 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49676 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49677 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49678 // which will expand the extension.
49679 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49680 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49681 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49682 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49683 }
49684 }
49685 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
49686 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49687 N->isOnlyUserOf(Op.getNode())) {
49688 SDValue Src = Op.getOperand(0);
49689 if (Src.getScalarValueSizeInBits() == 16)
49690 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49691 }
49692 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49693 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49694 N->isOnlyUserOf(Op.getNode())) {
49695 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49696 Op.getOperand(1));
49697 }
49698 return SDValue();
49699 };
49700 SDValue ZeroN0 = GetZeroableOp(N0);
49701 SDValue ZeroN1 = GetZeroableOp(N1);
49702 if (!ZeroN0 && !ZeroN1)
49703 return SDValue();
49704 N0 = ZeroN0 ? ZeroN0 : N0;
49705 N1 = ZeroN1 ? ZeroN1 : N1;
49706
49707 // Use SplitOpsAndApply to handle AVX splitting.
49708 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49709 ArrayRef<SDValue> Ops) {
49710 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49711 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49712 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49713 DAG.getBitcast(OpVT, Ops[0]),
49714 DAG.getBitcast(OpVT, Ops[1]));
49715 };
49716 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49717}
49718
49719static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49720 const X86Subtarget &Subtarget) {
49721 if (!Subtarget.hasSSE2())
49722 return SDValue();
49723
49724 EVT VT = N->getValueType(0);
49725
49726 // Only support vXi64 vectors.
49727 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49728 VT.getVectorNumElements() < 2 ||
49729 !isPowerOf2_32(VT.getVectorNumElements()))
49730 return SDValue();
49731
49732 SDValue N0 = N->getOperand(0);
49733 SDValue N1 = N->getOperand(1);
49734
49735 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
49736 // 32 bits. We can lower with this if the sign bits stretch that far.
49737 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49738 DAG.ComputeNumSignBits(N1) > 32) {
49739 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49740 ArrayRef<SDValue> Ops) {
49741 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49742 };
49743 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49744 /*CheckBWI*/ false);
49745 }
49746
49747 // If the upper bits are zero we can use a single pmuludq.
49748 APInt Mask = APInt::getHighBitsSet(64, 32);
49749 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49750 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49751 ArrayRef<SDValue> Ops) {
49752 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49753 };
49754 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49755 /*CheckBWI*/ false);
49756 }
49757
49758 return SDValue();
49759}
49760
49761static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49762 TargetLowering::DAGCombinerInfo &DCI,
49763 const X86Subtarget &Subtarget) {
49764 EVT VT = N->getValueType(0);
49765 SDLoc DL(N);
49766
49767 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49768 return V;
49769
49770 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49771 return V;
49772
49773 if (DCI.isBeforeLegalize() && VT.isVector())
49774 return reduceVMULWidth(N, DL, DAG, Subtarget);
49775
49776 if (VT != MVT::i64 && VT != MVT::i32 &&
49777 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49778 return SDValue();
49779
49780 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49781 if (!Known1.isConstant())
49782 return SDValue();
49783
49784 const APInt &C = Known1.getConstant();
49785 if (C.isZero())
49786 return DAG.getConstant(0, DL, VT);
49787
49788 if (C.isAllOnes())
49789 return DAG.getNegative(N->getOperand(0), DL, VT);
49790
49791 if (isPowerOf2_64(C.getZExtValue()))
49792 return SDValue();
49793
49794 // Optimize a single multiply with constant into two operations in order to
49795 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49796 if (!MulConstantOptimization)
49797 return SDValue();
49798
49799 // An imul is usually smaller than the alternative sequence.
49800 if (DAG.getMachineFunction().getFunction().hasMinSize())
49801 return SDValue();
49802
49803 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49804 return SDValue();
49805
49806 int64_t SignMulAmt = C.getSExtValue();
49807 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49808 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49809
49810 SDValue NewMul = SDValue();
49811 if (VT == MVT::i64 || VT == MVT::i32) {
49812 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49813 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49814 DAG.getConstant(AbsMulAmt, DL, VT));
49815 if (SignMulAmt < 0)
49816 NewMul = DAG.getNegative(NewMul, DL, VT);
49817
49818 return NewMul;
49819 }
49820
49821 uint64_t MulAmt1 = 0;
49822 uint64_t MulAmt2 = 0;
49823 if ((AbsMulAmt % 9) == 0) {
49824 MulAmt1 = 9;
49825 MulAmt2 = AbsMulAmt / 9;
49826 } else if ((AbsMulAmt % 5) == 0) {
49827 MulAmt1 = 5;
49828 MulAmt2 = AbsMulAmt / 5;
49829 } else if ((AbsMulAmt % 3) == 0) {
49830 MulAmt1 = 3;
49831 MulAmt2 = AbsMulAmt / 3;
49832 }
49833
49834 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49835 if (MulAmt2 &&
49836 (isPowerOf2_64(MulAmt2) ||
49837 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49838
49839 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49840 N->user_begin()->getOpcode() == ISD::ADD))
49841 // If the second multiplier is a power of 2, issue it first. We want the multiply
49842 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49843 // use is an add. Only do this for positive multiply amounts since the
49844 // negate would prevent it from being used as an address mode anyway.
49845 std::swap(MulAmt1, MulAmt2);
49846
49847 if (isPowerOf2_64(MulAmt1))
49848 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49849 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49850 else
49851 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49852 DAG.getConstant(MulAmt1, DL, VT));
49853
49854 if (isPowerOf2_64(MulAmt2))
49855 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49856 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49857 else
49858 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49859 DAG.getConstant(MulAmt2, DL, VT));
49860
49861 // Negate the result.
49862 if (SignMulAmt < 0)
49863 NewMul = DAG.getNegative(NewMul, DL, VT);
49864 } else if (!Subtarget.slowLEA())
49865 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49866 }
49867 if (!NewMul) {
49868 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49869 if (isPowerOf2_64(AbsMulAmt - 1)) {
49870 // (mul x, 2^N + 1) => (add (shl x, N), x)
49871 NewMul = DAG.getNode(
49872 ISD::ADD, DL, VT, N->getOperand(0),
49873 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49874 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49875 if (SignMulAmt < 0)
49876 NewMul = DAG.getNegative(NewMul, DL, VT);
49877 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49878 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49879 NewMul =
49880 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49881 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49882 // To negate, reverse the operands of the subtract.
49883 if (SignMulAmt < 0)
49884 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49885 else
49886 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49887 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49888 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49889 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49890 NewMul =
49891 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49892 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49893 NewMul = DAG.getNode(
49894 ISD::ADD, DL, VT, NewMul,
49895 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49896 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49897 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49898 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49899 NewMul =
49900 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49901 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49902 NewMul = DAG.getNode(
49903 ISD::SUB, DL, VT, NewMul,
49904 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49905 } else if (SignMulAmt >= 0 && VT.isVector() &&
49906 Subtarget.fastImmVectorShift()) {
49907 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49908 uint64_t ShiftAmt1;
49909 std::optional<unsigned> Opc;
49910 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49911 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49912 Opc = ISD::ADD;
49913 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49914 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49915 Opc = ISD::SUB;
49916 }
49917
49918 if (Opc) {
49919 SDValue Shift1 =
49920 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49921 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49922 SDValue Shift2 =
49923 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49924 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49925 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49926 }
49927 }
49928 }
49929
49930 return NewMul;
49931}
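// Illustrative expansions produced by the constant-multiply logic above
// (amounts chosen arbitrarily):
//   mul x, 45: 45 = 9 * 5   -> two LEAs (scale 8, then scale 4)
//   mul x, 40: 40 = 5 * 8   -> one shift by 3 plus one LEA (scale 4)
//   mul x, 17: 2^4 + 1      -> (x << 4) + x
//   mul x, -15: -(2^4 - 1)  -> x - (x << 4), a reversed subtract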
49932
49933// Try to form a MULHU or MULHS node by looking for
49934// (srl (mul ext, ext), 16)
49935// TODO: This is X86 specific because we want to be able to handle wide types
49936// before type legalization. But we can only do it if the vector will be
49937// legalized via widening/splitting. Type legalization can't handle promotion
49938// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49939// combiner.
49940static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49941 const SDLoc &DL,
49942 const X86Subtarget &Subtarget) {
49943 using namespace SDPatternMatch;
49944 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49945 "SRL or SRA node is required here!");
49946
49947 if (!Subtarget.hasSSE2())
49948 return SDValue();
49949
49950 // Input type should be at least vXi32.
49951 EVT VT = N->getValueType(0);
49952 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49953 return SDValue();
49954
49955 // The operation must be a multiply shifted right by 16.
49956 SDValue LHS, RHS;
49957 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49958 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49959 return SDValue();
49960
49961 unsigned ExtOpc = LHS.getOpcode();
49962 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49963 RHS.getOpcode() != ExtOpc)
49964 return SDValue();
49965
49966 // Peek through the extends.
49967 LHS = LHS.getOperand(0);
49968 RHS = RHS.getOperand(0);
49969
49970 // Ensure the input types match.
49971 EVT MulVT = LHS.getValueType();
49972 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49973 return SDValue();
49974
49975 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49976 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49977
49978 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49979 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49980}
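// Shape of the pattern matched above, written as generic DAG nodes for a
// hypothetical v8i32 case:
//   (srl (mul (zext v8i16 a), (zext v8i16 b)), 16) -> (zext (mulhu a, b))
// The signed variant (sra / sign_extend) forms MULHS instead, which later
// selects to pmulhw / pmulhuw.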
49981
49982static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49983 const X86Subtarget &Subtarget) {
49984 using namespace llvm::SDPatternMatch;
49985 SDValue N0 = N->getOperand(0);
49986 SDValue N1 = N->getOperand(1);
49987 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49988 EVT VT = N0.getValueType();
49989 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49990 SDLoc DL(N);
49991
49992 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49993 // with out-of-bounds clamping.
49994 if (N0.getOpcode() == ISD::VSELECT &&
49995 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49996 SDValue Cond = N0.getOperand(0);
49997 SDValue N00 = N0.getOperand(1);
49998 SDValue N01 = N0.getOperand(2);
49999 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50000 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50001 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50002 m_SpecificCondCode(ISD::SETULT)))) {
50003 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50004 }
50005 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50006 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50007 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50008 m_SpecificCondCode(ISD::SETUGE)))) {
50009 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50010 }
50011 }
50012
50013 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50014 // since the result of setcc_c is all zero's or all ones.
50015 if (VT.isInteger() && !VT.isVector() &&
50016 N1C && N0.getOpcode() == ISD::AND &&
50017 N0.getOperand(1).getOpcode() == ISD::Constant) {
50018 SDValue N00 = N0.getOperand(0);
50019 APInt Mask = N0.getConstantOperandAPInt(1);
50020 Mask <<= N1C->getAPIntValue();
50021 bool MaskOK = false;
50022 // We can handle cases concerning bit-widening nodes containing setcc_c if
50023 // we carefully interrogate the mask to make sure we are semantics
50024 // preserving.
50025 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50026 // of the underlying setcc_c operation if the setcc_c was zero extended.
50027 // Consider the following example:
50028 // zext(setcc_c) -> i32 0x0000FFFF
50029 // c1 -> i32 0x0000FFFF
50030 // c2 -> i32 0x00000001
50031 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50032 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50033 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50034 MaskOK = true;
50035 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50036 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50037 MaskOK = true;
50038 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50039 N00.getOpcode() == ISD::ANY_EXTEND) &&
50040 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50041 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50042 }
50043 if (MaskOK && Mask != 0)
50044 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50045 }
50046
50047 return SDValue();
50048}
50049
50050static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50051 const X86Subtarget &Subtarget) {
50052 using namespace llvm::SDPatternMatch;
50053 SDValue N0 = N->getOperand(0);
50054 SDValue N1 = N->getOperand(1);
50055 EVT VT = N0.getValueType();
50056 unsigned Size = VT.getSizeInBits();
50057 SDLoc DL(N);
50058
50059 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50060 return V;
50061
50062 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50063 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50064 SDValue ShrAmtVal;
50065 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50066 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50067 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50068 }
50069
50070 // fold (SRA (SHL X, ShlConst), SraConst)
50071 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50072 // or (sext_in_reg X)
50073 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50074 // depending on relation between SraConst and ShlConst.
50075 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50076 // us to do the sext_in_reg from corresponding bit.
50077
50078 // sexts in X86 are MOVs (movsx). The MOVs have the same code size
50079 // as the SHIFTs above (only a SHIFT by 1 has a smaller encoding).
50080 // However the MOVs have 2 advantages to a SHIFT:
50081 // 1. MOVs can write to a register that differs from source
50082 // 2. MOVs accept memory operands
50083
50084 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50085 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50086 N0.getOperand(1).getOpcode() != ISD::Constant)
50087 return SDValue();
50088
50089 SDValue N00 = N0.getOperand(0);
50090 SDValue N01 = N0.getOperand(1);
50091 APInt ShlConst = N01->getAsAPIntVal();
50092 APInt SraConst = N1->getAsAPIntVal();
50093 EVT CVT = N1.getValueType();
50094
50095 if (CVT != N01.getValueType())
50096 return SDValue();
50097 if (SraConst.isNegative())
50098 return SDValue();
50099
50100 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50101 unsigned ShiftSize = SVT.getSizeInBits();
50102 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50103 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50104 continue;
50105 SDValue NN =
50106 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50107 if (SraConst.eq(ShlConst))
50108 return NN;
50109 if (SraConst.ult(ShlConst))
50110 return DAG.getNode(ISD::SHL, DL, VT, NN,
50111 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50112 return DAG.getNode(ISD::SRA, DL, VT, NN,
50113 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50114 }
50115 return SDValue();
50116}
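// Worked instance of the (SRA (SHL X, ShlConst), SraConst) rewrite above for
// a 32-bit value with ShlConst == 24 (so Size - ShlConst == 8):
//   SraConst == 24: (sra (shl x, 24), 24) -> sext_in_reg x, i8 (a movsx)
//   SraConst == 20: sext_in_reg x, i8, then shl by the remaining 4
//   SraConst == 27: sext_in_reg x, i8, then sra by 3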
50117
50118static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50119 TargetLowering::DAGCombinerInfo &DCI,
50120 const X86Subtarget &Subtarget) {
50121 using namespace llvm::SDPatternMatch;
50122 SDValue N0 = N->getOperand(0);
50123 SDValue N1 = N->getOperand(1);
50124 EVT VT = N0.getValueType();
50125 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50126 SDLoc DL(N);
50127
50128 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50129 return V;
50130
50131 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50132 // with out-of-bounds clamping.
50133 if (N0.getOpcode() == ISD::VSELECT &&
50134 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50135 SDValue Cond = N0.getOperand(0);
50136 SDValue N00 = N0.getOperand(1);
50137 SDValue N01 = N0.getOperand(2);
50138 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50139 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50140 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50141 m_SpecificCondCode(ISD::SETULT)))) {
50142 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50143 }
50144 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50145 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50146 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50147 m_SpecificCondCode(ISD::SETUGE)))) {
50148 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50149 }
50150 }
50151
50152 // Only do this on the last DAG combine as it can interfere with other
50153 // combines.
50154 if (!DCI.isAfterLegalizeDAG())
50155 return SDValue();
50156
50157 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50158 // TODO: This is a generic DAG combine that became an x86-only combine to
50159 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50160 // and-not ('andn').
50161 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50162 return SDValue();
50163
50164 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50165 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50166 if (!ShiftC || !AndC)
50167 return SDValue();
50168
50169 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50170 // transform should reduce code size. It may also enable secondary transforms
50171 // from improved known-bits analysis or instruction selection.
50172 APInt MaskVal = AndC->getAPIntValue();
50173
50174 // If this can be matched by a zero extend, don't optimize.
50175 if (MaskVal.isMask()) {
50176 unsigned TO = MaskVal.countr_one();
50177 if (TO >= 8 && isPowerOf2_32(TO))
50178 return SDValue();
50179 }
50180
50181 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50182 unsigned OldMaskSize = MaskVal.getSignificantBits();
50183 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50184 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50185 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50186 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50187 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50188 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50189 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50190 }
50191 return SDValue();
50192}
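// Example of the mask-shrinking rewrite above (constants illustrative):
//   srl (and X, 0xFF00), 8  -->  and (srl X, 8), 0xFF
// The shifted mask fits in 8 bits, so the AND immediate gets a shorter
// encoding and known-bits analysis sees a tighter mask.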
50193
50194static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50195 const X86Subtarget &Subtarget) {
50196 unsigned Opcode = N->getOpcode();
50197 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50198
50199 SDLoc DL(N);
50200 EVT VT = N->getValueType(0);
50201 SDValue N0 = N->getOperand(0);
50202 SDValue N1 = N->getOperand(1);
50203 EVT SrcVT = N0.getValueType();
50204
50205 SDValue BC0 =
50206 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50207 SDValue BC1 =
50208 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50209
50210 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50211 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50212 // truncation trees that help us avoid lane crossing shuffles.
50213 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50214 // TODO: We don't handle vXf64 shuffles yet.
50215 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50216 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50217 SmallVector<SDValue> ShuffleOps;
50218 SmallVector<int> ShuffleMask, ScaledMask;
50219 SDValue Vec = peekThroughBitcasts(BCSrc);
50220 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50222 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50223 // shuffle to a v4X64 width - we can probably relax this in the future.
50224 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50225 ShuffleOps[0].getValueType().is256BitVector() &&
50226 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50227 SDValue Lo, Hi;
50228 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50229 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50230 Lo = DAG.getBitcast(SrcVT, Lo);
50231 Hi = DAG.getBitcast(SrcVT, Hi);
50232 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50233 Res = DAG.getBitcast(ShufVT, Res);
50234 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50235 return DAG.getBitcast(VT, Res);
50236 }
50237 }
50238 }
50239 }
50240
50241 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50242 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50243 // If either/both ops are a shuffle that can scale to v2x64,
50244 // then see if we can perform this as a v4x32 post shuffle.
50245 SmallVector<SDValue> Ops0, Ops1;
50246 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50247 bool IsShuf0 =
50248 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50249 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50250 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50251 bool IsShuf1 =
50252 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50253 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50254 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50255 if (IsShuf0 || IsShuf1) {
50256 if (!IsShuf0) {
50257 Ops0.assign({BC0});
50258 ScaledMask0.assign({0, 1});
50259 }
50260 if (!IsShuf1) {
50261 Ops1.assign({BC1});
50262 ScaledMask1.assign({0, 1});
50263 }
50264
50265 SDValue LHS, RHS;
50266 int PostShuffle[4] = {-1, -1, -1, -1};
50267 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50268 if (M < 0)
50269 return true;
50270 Idx = M % 2;
50271 SDValue Src = Ops[M / 2];
50272 if (!LHS || LHS == Src) {
50273 LHS = Src;
50274 return true;
50275 }
50276 if (!RHS || RHS == Src) {
50277 Idx += 2;
50278 RHS = Src;
50279 return true;
50280 }
50281 return false;
50282 };
50283 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50284 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50285 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50286 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50287 LHS = DAG.getBitcast(SrcVT, LHS);
50288 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50289 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50290 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50291 Res = DAG.getBitcast(ShufVT, Res);
50292 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50293 return DAG.getBitcast(VT, Res);
50294 }
50295 }
50296 }
50297
50298 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50299 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50300 SmallVector<int> Mask0, Mask1;
50301 SmallVector<SDValue> Ops0, Ops1;
50302 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50303 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50304 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50305 !Ops0.empty() && !Ops1.empty() &&
50306 all_of(Ops0,
50307 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50308 all_of(Ops1,
50309 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50310 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50311 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50312 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50313 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50314 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50315 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50316 if ((Op00 == Op11) && (Op01 == Op10)) {
50317 std::swap(Op10, Op11);
50318 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50319 }
50320 if ((Op00 == Op10) && (Op01 == Op11)) {
50321 const int Map[4] = {0, 2, 1, 3};
50322 SmallVector<int, 4> ShuffleMask(
50323 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50324 Map[ScaledMask1[1]]});
50325 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50326 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50327 DAG.getBitcast(SrcVT, Op01));
50328 Res = DAG.getBitcast(ShufVT, Res);
50329 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50330 return DAG.getBitcast(VT, Res);
50331 }
50332 }
50333 }
50334
50335 return SDValue();
50336}
50337
50338static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50339 TargetLowering::DAGCombinerInfo &DCI,
50340 const X86Subtarget &Subtarget) {
50341 unsigned Opcode = N->getOpcode();
50342 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50343 "Unexpected pack opcode");
50344
50345 EVT VT = N->getValueType(0);
50346 SDValue N0 = N->getOperand(0);
50347 SDValue N1 = N->getOperand(1);
50348 unsigned NumDstElts = VT.getVectorNumElements();
50349 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50350 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50351 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50352 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50353 "Unexpected PACKSS/PACKUS input type");
50354
50355 bool IsSigned = (X86ISD::PACKSS == Opcode);
50356
50357 // Constant Folding.
50358 APInt UndefElts0, UndefElts1;
50359 SmallVector<APInt, 32> EltBits0, EltBits1;
50360 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50361 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50362 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50363 /*AllowWholeUndefs*/ true,
50364 /*AllowPartialUndefs*/ true) &&
50365 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50366 /*AllowWholeUndefs*/ true,
50367 /*AllowPartialUndefs*/ true)) {
50368 unsigned NumLanes = VT.getSizeInBits() / 128;
50369 unsigned NumSrcElts = NumDstElts / 2;
50370 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50371 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50372
50373 APInt Undefs(NumDstElts, 0);
50374 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50375 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50376 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50377 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50378 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50379 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50380
50381 if (UndefElts[SrcIdx]) {
50382 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50383 continue;
50384 }
50385
50386 APInt &Val = EltBits[SrcIdx];
50387 if (IsSigned) {
50388 // PACKSS: Truncate signed value with signed saturation.
50389 // Source values less than dst minint are saturated to minint.
50390 // Source values greater than dst maxint are saturated to maxint.
50391 Val = Val.truncSSat(DstBitsPerElt);
50392 } else {
50393 // PACKUS: Truncate signed value with unsigned saturation.
50394 // Source values less than zero are saturated to zero.
50395 // Source values greater than dst maxuint are saturated to maxuint.
50396 // NOTE: This is different from APInt::truncUSat.
50397 if (Val.isIntN(DstBitsPerElt))
50398 Val = Val.trunc(DstBitsPerElt);
50399 else if (Val.isNegative())
50400 Val = APInt::getZero(DstBitsPerElt);
50401 else
50402 Val = APInt::getAllOnes(DstBitsPerElt);
50403 }
50404 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50405 }
50406 }
50407
50408 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50409 }
50410
50411 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50412 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50413 return V;
50414
50415 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50416 // Currently limit this to allsignbits cases only.
50417 if (IsSigned &&
50418 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50419 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50420 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50421 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50422 if (Not0 && Not1) {
50423 SDLoc DL(N);
50424 MVT SrcVT = N0.getSimpleValueType();
50425 SDValue Pack =
50426 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50427 DAG.getBitcast(SrcVT, Not1));
50428 return DAG.getNOT(DL, Pack, VT);
50429 }
50430 }
50431
50432 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50433 // truncate to create a larger truncate.
50434 if (Subtarget.hasAVX512() &&
50435 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50436 N0.getOperand(0).getValueType() == MVT::v8i32) {
50437 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50438 (!IsSigned &&
50439 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50440 if (Subtarget.hasVLX())
50441 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50442
50443 // Widen input to v16i32 so we can truncate that.
50444 SDLoc dl(N);
50445 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50446 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50447 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50448 }
50449 }
50450
50451 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50452 if (VT.is128BitVector()) {
50453 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50454 SDValue Src0, Src1;
50455 if (N0.getOpcode() == ExtOpc &&
50456 N0.getOperand(0).getValueType().is64BitVector() &&
50457 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50458 Src0 = N0.getOperand(0);
50459 }
50460 if (N1.getOpcode() == ExtOpc &&
50461 N1.getOperand(0).getValueType().is64BitVector() &&
50462 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50463 Src1 = N1.getOperand(0);
50464 }
50465 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50466 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50467 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50468 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50469 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50470 }
50471
50472 // Try again with pack(*_extend_vector_inreg, undef).
50473 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50474 : ISD::ZERO_EXTEND_VECTOR_INREG;
50475 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50476 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50477 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50478 DAG);
50479 }
50480
50481 // Attempt to combine as shuffle.
50482 SDValue Op(N, 0);
50483 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50484 return Res;
50485
50486 return SDValue();
50487}
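// Saturation behaviour used by the constant folding above, shown for an
// i16 -> i8 pack of the values {300, -5}:
//   PACKSS: 300 -> 127 (signed max),   -5 -> -5 (fits)
//   PACKUS: 300 -> 255 (unsigned max), -5 -> 0  (clamped at zero)
// i.e. signed inputs with unsigned saturation, unlike APInt::truncUSat.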
50488
50489static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50490 TargetLowering::DAGCombinerInfo &DCI,
50491 const X86Subtarget &Subtarget) {
50492 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50493 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50494 "Unexpected horizontal add/sub opcode");
50495
50496 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50497 MVT VT = N->getSimpleValueType(0);
50498 SDValue LHS = N->getOperand(0);
50499 SDValue RHS = N->getOperand(1);
50500
50501 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50502 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50503 LHS.getOpcode() == RHS.getOpcode() &&
50504 LHS.getValueType() == RHS.getValueType() &&
50505 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50506 SDValue LHS0 = LHS.getOperand(0);
50507 SDValue LHS1 = LHS.getOperand(1);
50508 SDValue RHS0 = RHS.getOperand(0);
50509 SDValue RHS1 = RHS.getOperand(1);
50510 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50511 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50512 SDLoc DL(N);
50513 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50514 LHS0.isUndef() ? LHS1 : LHS0,
50515 RHS0.isUndef() ? RHS1 : RHS0);
50516 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50517 Res = DAG.getBitcast(ShufVT, Res);
50518 SDValue NewLHS =
50519 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50520 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50521 SDValue NewRHS =
50522 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50523 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50524 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50525 DAG.getBitcast(VT, NewRHS));
50526 }
50527 }
50528 }
50529
50530 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50531 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50532 return V;
50533
50534 return SDValue();
50535}
50536
50537static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50538 TargetLowering::DAGCombinerInfo &DCI,
50539 const X86Subtarget &Subtarget) {
50540 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50541 X86ISD::VSRL == N->getOpcode()) &&
50542 "Unexpected shift opcode");
50543 EVT VT = N->getValueType(0);
50544 SDValue N0 = N->getOperand(0);
50545 SDValue N1 = N->getOperand(1);
50546
50547 // Shift zero -> zero.
50548 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50549 return DAG.getConstant(0, SDLoc(N), VT);
50550
50551 // Detect constant shift amounts.
50552 APInt UndefElts;
50553 SmallVector<APInt, 32> EltBits;
50554 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50555 /*AllowWholeUndefs*/ true,
50556 /*AllowPartialUndefs*/ false)) {
50557 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50558 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50559 EltBits[0].getZExtValue(), DAG);
50560 }
50561
50562 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50563 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50564 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50565 return SDValue(N, 0);
50566
50567 return SDValue();
50568}
50569
50570static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50571 TargetLowering::DAGCombinerInfo &DCI,
50572 const X86Subtarget &Subtarget) {
50573 unsigned Opcode = N->getOpcode();
50574 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50575 X86ISD::VSRLI == Opcode) &&
50576 "Unexpected shift opcode");
50577 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50578 EVT VT = N->getValueType(0);
50579 SDValue N0 = N->getOperand(0);
50580 SDValue N1 = N->getOperand(1);
50581 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50582 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50583 "Unexpected value type");
50584 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50585
50586 // (shift undef, X) -> 0
50587 if (N0.isUndef())
50588 return DAG.getConstant(0, SDLoc(N), VT);
50589
50590 // Out of range logical bit shifts are guaranteed to be zero.
50591 // Out of range arithmetic bit shifts splat the sign bit.
50592 unsigned ShiftVal = N->getConstantOperandVal(1);
50593 if (ShiftVal >= NumBitsPerElt) {
50594 if (LogicalShift)
50595 return DAG.getConstant(0, SDLoc(N), VT);
50596 ShiftVal = NumBitsPerElt - 1;
50597 }
50598
50599 // (shift X, 0) -> X
50600 if (!ShiftVal)
50601 return N0;
50602
50603 // (shift 0, C) -> 0
50604 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50605 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50606 // result are all zeros, not undef.
50607 return DAG.getConstant(0, SDLoc(N), VT);
50608
50609 // (VSRAI -1, C) -> -1
50610 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50611 // N0 is all ones or undef. We guarantee that the bits shifted into the
50612 // result are all ones, not undef.
50613 return DAG.getAllOnesConstant(SDLoc(N), VT);
50614
50615 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50616 unsigned NewShiftVal = Amt0 + Amt1;
50617 if (NewShiftVal >= NumBitsPerElt) {
50618 // Out of range logical bit shifts are guaranteed to be zero.
50619 // Out of range arithmetic bit shifts splat the sign bit.
50620 if (LogicalShift)
50621 return DAG.getConstant(0, SDLoc(N), VT);
50622 NewShiftVal = NumBitsPerElt - 1;
50623 }
50624 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50625 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50626 };
50627
50628 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50629 if (Opcode == N0.getOpcode())
50630 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50631
50632 // (shl (add X, X), C) -> (shl X, (C + 1))
50633 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50634 N0.getOperand(0) == N0.getOperand(1))
50635 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
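// For example (illustrative): (vshli (vshli X, 3), 2) becomes (vshli X, 5),
// and (vshli (add X, X), 2) becomes (vshli X, 3), since add X, X == shl X, 1.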
50636
50637 // We can decode 'whole byte' logical bit shifts as shuffles.
50638 if (LogicalShift && (ShiftVal % 8) == 0) {
50639 SDValue Op(N, 0);
50640 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50641 return Res;
50642 }
50643
50644 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50645 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50646 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50647 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50648 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50649 N0.getOpcode() == X86ISD::PSHUFD &&
50650 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50651 N0->hasOneUse()) {
50652 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50653 if (BC.getOpcode() == X86ISD::VSHLI &&
50654 BC.getScalarValueSizeInBits() == 64 &&
50655 BC.getConstantOperandVal(1) == 63) {
50656 SDLoc DL(N);
50657 SDValue Src = BC.getOperand(0);
50658 Src = DAG.getBitcast(VT, Src);
50659 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50660 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50661 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50662 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50663 return Src;
50664 }
50665 }
50666
50667 auto TryConstantFold = [&](SDValue V) {
50668 APInt UndefElts;
50669 SmallVector<APInt, 32> EltBits;
50670 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50671 /*AllowWholeUndefs*/ true,
50672 /*AllowPartialUndefs*/ true))
50673 return SDValue();
50674 assert(EltBits.size() == VT.getVectorNumElements() &&
50675 "Unexpected shift value type");
50676 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50677 // created an undef input due to no input bits being demanded, but user
50678 // still expects 0 in other bits.
50679 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50680 APInt &Elt = EltBits[i];
50681 if (UndefElts[i])
50682 Elt = 0;
50683 else if (X86ISD::VSHLI == Opcode)
50684 Elt <<= ShiftVal;
50685 else if (X86ISD::VSRAI == Opcode)
50686 Elt.ashrInPlace(ShiftVal);
50687 else
50688 Elt.lshrInPlace(ShiftVal);
50689 }
50690 // Reset undef elements since they were zeroed above.
50691 UndefElts = 0;
50692 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50693 };
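// Illustrative constant fold (not part of the original source): a VSRAI by 2
// of v4i32 <8, -8, undef, 1> yields <2, -2, 0, 0>; the undef lane is
// materialized as 0 so users that rely on zeroed bits stay correct.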
50694
50695 // Constant Folding.
50696 if (N->isOnlyUserOf(N0.getNode())) {
50697 if (SDValue C = TryConstantFold(N0))
50698 return C;
50699
50700 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50701 // Don't break NOT patterns.
50702 SDValue BC = peekThroughOneUseBitcasts(N0);
50703 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50704 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50705 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50706 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50707 SDLoc DL(N);
50708 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50709 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50710 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50711 }
50712 }
50713 }
50714
50715 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50716 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50717 DCI))
50718 return SDValue(N, 0);
50719
50720 return SDValue();
50721}
50722
50723 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50724 TargetLowering::DAGCombinerInfo &DCI,
50725 const X86Subtarget &Subtarget) {
50726 EVT VT = N->getValueType(0);
50727 unsigned Opcode = N->getOpcode();
50728 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50729 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50730 Opcode == ISD::INSERT_VECTOR_ELT) &&
50731 "Unexpected vector insertion");
50732
50733 SDValue Vec = N->getOperand(0);
50734 SDValue Scl = N->getOperand(1);
50735 SDValue Idx = N->getOperand(2);
50736
50737 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50738 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50739 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50740
50741 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50742 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50744 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50745 APInt::getAllOnes(NumBitsPerElt), DCI))
50746 return SDValue(N, 0);
50747 }
50748
50749 // Attempt to combine insertion patterns to a shuffle.
50750 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50751 SDValue Op(N, 0);
50752 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50753 return Res;
50754 }
50755
50756 return SDValue();
50757}
50758
50759/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50760/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50761/// OR -> CMPNEQSS.
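// Illustrative example (not part of the original source): for f32 operands,
//   and (setcc COND_E, (fcmp x, y)), (setcc COND_NP, (fcmp x, y))
// is the scalar "ordered and equal" test; it is rewritten below into a single
// FSETCC with immediate 0 (CMPEQSS) whose result is masked down to one bit.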
50762 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50763 TargetLowering::DAGCombinerInfo &DCI,
50764 const X86Subtarget &Subtarget) {
50765 unsigned opcode;
50766
50767 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50768 // we're requiring SSE2 for both.
50769 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50770 SDValue N0 = N->getOperand(0);
50771 SDValue N1 = N->getOperand(1);
50772 SDValue CMP0 = N0.getOperand(1);
50773 SDValue CMP1 = N1.getOperand(1);
50774 SDLoc DL(N);
50775
50776 // The SETCCs should both refer to the same CMP.
50777 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50778 return SDValue();
50779
50780 SDValue CMP00 = CMP0->getOperand(0);
50781 SDValue CMP01 = CMP0->getOperand(1);
50782 EVT VT = CMP00.getValueType();
50783
50784 if (VT == MVT::f32 || VT == MVT::f64 ||
50785 (VT == MVT::f16 && Subtarget.hasFP16())) {
50786 bool ExpectingFlags = false;
50787 // Check for any users that want flags:
50788 for (const SDNode *U : N->users()) {
50789 if (ExpectingFlags)
50790 break;
50791
50792 switch (U->getOpcode()) {
50793 default:
50794 case ISD::BR_CC:
50795 case ISD::BRCOND:
50796 case ISD::SELECT:
50797 ExpectingFlags = true;
50798 break;
50799 case ISD::CopyToReg:
50800 case ISD::SIGN_EXTEND:
50801 case ISD::ZERO_EXTEND:
50802 case ISD::ANY_EXTEND:
50803 break;
50804 }
50805 }
50806
50807 if (!ExpectingFlags) {
50808 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50809 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50810
50811 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50812 X86::CondCode tmp = cc0;
50813 cc0 = cc1;
50814 cc1 = tmp;
50815 }
50816
50817 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50818 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50819 // FIXME: need symbolic constants for these magic numbers.
50820 // See X86ATTInstPrinter.cpp:printSSECC().
50821 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50822 if (Subtarget.hasAVX512()) {
50823 SDValue FSetCC =
50824 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50825 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50826 // Need to fill with zeros to ensure the bitcast will produce zeroes
50827 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50828 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50829 DAG.getConstant(0, DL, MVT::v16i1),
50830 FSetCC, DAG.getVectorIdxConstant(0, DL));
50831 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50832 N->getSimpleValueType(0));
50833 }
50834 SDValue OnesOrZeroesF =
50835 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50836 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50837
50838 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50839 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50840
50841 if (is64BitFP && !Subtarget.is64Bit()) {
50842 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50843 // 64-bit integer, since that's not a legal type. Since
50844 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50845 // bits, but can do this little dance to extract the lowest 32 bits
50846 // and work with those going forward.
50847 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50848 MVT::v2f64, OnesOrZeroesF);
50849 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50850 OnesOrZeroesF =
50851 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50852 DAG.getVectorIdxConstant(0, DL));
50853 IntVT = MVT::i32;
50854 }
50855
50856 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50857 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50858 DAG.getConstant(1, DL, IntVT));
50859 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50860 ANDed);
50861 return OneBitOfTruth;
50862 }
50863 }
50864 }
50865 }
50866 return SDValue();
50867}
50868
50869/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
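// For example (illustrative): with v4i32 operands,
//   (and (xor X, <-1,-1,-1,-1>), Y)
// becomes a single ANDNP node, which maps to PANDN/VPANDN and avoids
// materializing the all-ones constant.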
50870 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50871 SelectionDAG &DAG) {
50872 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50873
50874 MVT VT = N->getSimpleValueType(0);
50875 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50876 return SDValue();
50877
50878 SDValue X, Y;
50879 SDValue N0 = N->getOperand(0);
50880 SDValue N1 = N->getOperand(1);
50881
50882 if (SDValue Not = IsNOT(N0, DAG)) {
50883 X = Not;
50884 Y = N1;
50885 } else if (SDValue Not = IsNOT(N1, DAG)) {
50886 X = Not;
50887 Y = N0;
50888 } else
50889 return SDValue();
50890
50891 X = DAG.getBitcast(VT, X);
50892 Y = DAG.getBitcast(VT, Y);
50893 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50894}
50895
50896/// Try to fold:
50897/// and (vector_shuffle<Z,...,Z>
50898/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50899/// ->
50900/// andnp (vector_shuffle<Z,...,Z>
50901/// (insert_vector_elt undef, X, Z), undef), Y
50902 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50903 const X86Subtarget &Subtarget) {
50904 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50905
50906 EVT VT = N->getValueType(0);
50907 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50908 // value and require extra moves.
50909 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50910 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50911 return SDValue();
50912
50913 auto GetNot = [&DAG](SDValue V) {
50914 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50915 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50916 // end-users are ISD::AND including cases
50917 // (and(extract_vector_element(SVN), Y)).
50918 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50919 !SVN->getOperand(1).isUndef()) {
50920 return SDValue();
50921 }
50922 SDValue IVEN = SVN->getOperand(0);
50923 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50924 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50925 return SDValue();
50926 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50927 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50928 return SDValue();
50929 SDValue Src = IVEN.getOperand(1);
50930 if (SDValue Not = IsNOT(Src, DAG)) {
50931 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50932 SDValue NotIVEN =
50933 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50934 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50935 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50936 SVN->getOperand(1), SVN->getMask());
50937 }
50938 return SDValue();
50939 };
50940
50941 SDValue X, Y;
50942 SDValue N0 = N->getOperand(0);
50943 SDValue N1 = N->getOperand(1);
50944 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50945
50946 if (SDValue Not = GetNot(N0)) {
50947 X = Not;
50948 Y = N1;
50949 } else if (SDValue Not = GetNot(N1)) {
50950 X = Not;
50951 Y = N0;
50952 } else
50953 return SDValue();
50954
50955 X = DAG.getBitcast(VT, X);
50956 Y = DAG.getBitcast(VT, Y);
50957 SDLoc DL(N);
50958
50959 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50960 // AVX2.
50961 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50962 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50963 SDValue LoX, HiX;
50964 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50965 SDValue LoY, HiY;
50966 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50967 EVT SplitVT = LoX.getValueType();
50968 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50969 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50970 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50971 }
50972
50973 if (TLI.isTypeLegal(VT))
50974 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50975
50976 return SDValue();
50977}
50978
50979// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50980// logical operations, like in the example below.
50981// or (and (truncate x, truncate y)),
50982// (xor (truncate z, build_vector (constants)))
50983// Given a target type \p VT, we generate
50984// or (and x, y), (xor z, zext(build_vector (constants)))
50985// given x, y and z are of type \p VT. We can do so, if operands are either
50986// truncates from VT types, the second operand is a vector of constants, can
50987// be recursively promoted or is an existing extension we can extend further.
50988 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50989 SelectionDAG &DAG,
50990 const X86Subtarget &Subtarget,
50991 unsigned Depth) {
50992 // Limit recursion to avoid excessive compile times.
50993 if (Depth >= SelectionDAG::MaxRecursionDepth)
50994 return SDValue();
50995
50996 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50997 return SDValue();
50998
50999 SDValue N0 = N.getOperand(0);
51000 SDValue N1 = N.getOperand(1);
51001
51002 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51003 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51004 return SDValue();
51005
51006 if (SDValue NN0 =
51007 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51008 N0 = NN0;
51009 else {
51010 // The left side has to be a 'trunc'.
51011 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51012 N0.getOperand(0).getValueType() == VT;
51013 if (LHSTrunc)
51014 N0 = N0.getOperand(0);
51015 else
51016 return SDValue();
51017 }
51018
51019 if (SDValue NN1 =
51020 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51021 N1 = NN1;
51022 else {
51023 // The right side has to be a 'trunc', a (foldable) constant or an
51024 // existing extension we can extend further.
51025 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51026 N1.getOperand(0).getValueType() == VT;
51027 if (RHSTrunc)
51028 N1 = N1.getOperand(0);
51029 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51030 Subtarget.hasInt256() && N1.hasOneUse())
51031 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51032 else if (SDValue Cst =
51033 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51034 N1 = Cst;
51035 else
51036 return SDValue();
51037 }
51038
51039 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51040}
51041
51042// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51043// register. In most cases we actually compare or select YMM-sized registers
51044// and mixing the two types creates horrible code. This method optimizes
51045// some of the transition sequences.
51046// Even with AVX-512 this is still useful for removing casts around logical
51047// operations on vXi1 mask types.
51048 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51049 SelectionDAG &DAG,
51050 const X86Subtarget &Subtarget) {
51051 EVT VT = N.getValueType();
51052 assert(VT.isVector() && "Expected vector type");
51053 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51054 N.getOpcode() == ISD::ZERO_EXTEND ||
51055 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51056
51057 SDValue Narrow = N.getOperand(0);
51058 EVT NarrowVT = Narrow.getValueType();
51059
51060 // Generate the wide operation.
51061 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51062 if (!Op)
51063 return SDValue();
51064 switch (N.getOpcode()) {
51065 default: llvm_unreachable("Unexpected opcode");
51066 case ISD::ANY_EXTEND:
51067 return Op;
51068 case ISD::ZERO_EXTEND:
51069 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51070 case ISD::SIGN_EXTEND:
51071 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51072 Op, DAG.getValueType(NarrowVT));
51073 }
51074}
51075
51076static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51077 unsigned FPOpcode;
51078 switch (Opcode) {
51079 // clang-format off
51080 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51081 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51082 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51083 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51084 // clang-format on
51085 }
51086 return FPOpcode;
51087}
51088
51089/// If both input operands of a logic op are being cast from floating-point
51090/// types or FP compares, try to convert this into a floating-point logic node
51091/// to avoid unnecessary moves from SSE to integer registers.
51092static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51093 SDValue N0, SDValue N1,
51094 SelectionDAG &DAG,
51095 TargetLowering::DAGCombinerInfo &DCI,
51096 const X86Subtarget &Subtarget) {
51097 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51098 "Unexpected bit opcode");
51099
51100 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51101 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51102 return SDValue();
51103
51104 SDValue N00 = N0.getOperand(0);
51105 SDValue N10 = N1.getOperand(0);
51106 EVT N00Type = N00.getValueType();
51107 EVT N10Type = N10.getValueType();
51108
51109 // Ensure that both types are the same and are legal scalar fp types.
51110 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51111 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51112 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51113 return SDValue();
51114
51115 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51116 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51117 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51118 return DAG.getBitcast(VT, FPLogic);
51119 }
51120
51121 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51122 !N1.hasOneUse())
51123 return SDValue();
51124
51125 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51126 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51127
51128 // The vector ISA for FP predicates is incomplete before AVX, so converting
51129 // COMIS* to CMPS* may not be a win before AVX.
51130 if (!Subtarget.hasAVX() &&
51131 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51132 return SDValue();
51133
51134 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51135 // and vector logic:
51136 // logic (setcc N00, N01), (setcc N10, N11) -->
51137 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51138 unsigned NumElts = 128 / N00Type.getSizeInBits();
51139 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51140 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51141 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51142 SDValue N01 = N0.getOperand(1);
51143 SDValue N11 = N1.getOperand(1);
51144 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51145 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51146 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51147 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51148 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51149 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51150 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51151 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51152}
51153
51154// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51155// to reduce XMM->GPR traffic.
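// For example (illustrative): (and (movmsk (v4f32 A)), (movmsk (v4f32 B)))
// becomes (movmsk of a single vector AND of A and B), replacing two XMM->GPR
// transfers with one.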
51156static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51157 SDValue N1, SelectionDAG &DAG) {
51158 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51159 "Unexpected bit opcode");
51160
51161 // Both operands must be single use MOVMSK.
51162 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51163 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51164 return SDValue();
51165
51166 SDValue Vec0 = N0.getOperand(0);
51167 SDValue Vec1 = N1.getOperand(0);
51168 EVT VecVT0 = Vec0.getValueType();
51169 EVT VecVT1 = Vec1.getValueType();
51170
51171 // Both MOVMSK operands must be from vectors of the same size and same element
51172 // size, but it's OK for an fp/int diff.
51173 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51174 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51175 return SDValue();
51176
51177 unsigned VecOpc =
51178 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51179 SDValue Result =
51180 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51181 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51182}
51183
51184// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51185// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51186// handles in InstCombine.
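// For example (illustrative): (xor (vsrli X, 4), (vsrli Y, 4)) becomes
// (vsrli (xor X, Y), 4), saving one shift when both shifts use the same
// immediate amount.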
51187static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51188 SDValue N0, SDValue N1,
51189 SelectionDAG &DAG) {
51190 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51191 "Unexpected bit opcode");
51192
51193 // Both operands must be single use.
51194 if (!N0.hasOneUse() || !N1.hasOneUse())
51195 return SDValue();
51196
51197 // Search for matching shifts.
51198 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51199 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51200
51201 unsigned BCOpc = BC0.getOpcode();
51202 EVT BCVT = BC0.getValueType();
51203 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51204 return SDValue();
51205
51206 switch (BCOpc) {
51207 case X86ISD::VSHLI:
51208 case X86ISD::VSRLI:
51209 case X86ISD::VSRAI: {
51210 if (BC0.getOperand(1) != BC1.getOperand(1))
51211 return SDValue();
51212 SDValue BitOp =
51213 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51214 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51215 return DAG.getBitcast(VT, Shift);
51216 }
51217 }
51218
51219 return SDValue();
51220}
51221
51222// Attempt to fold:
51223// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51224 // TODO: Add PACKUS handling.
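// Illustrative note (not part of the original source): the fold is restricted
// to all-sign-bits inputs (see the ComputeNumSignBits checks below) because
// PACKSS saturates; packing 0x00FF would not commute with a bitwise op, but
// values that are 0x0000 or 0xFFFF pack exactly to 0x00/0xFF and do commute.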
51225static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51226 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51227 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51228 "Unexpected bit opcode");
51229
51230 // Both operands must be single use.
51231 if (!N0.hasOneUse() || !N1.hasOneUse())
51232 return SDValue();
51233
51234 // Search for matching packs.
51235 N0 = peekThroughOneUseBitcasts(N0);
51236 N1 = peekThroughOneUseBitcasts(N1);
51237
51238 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51239 return SDValue();
51240
51241 MVT DstVT = N0.getSimpleValueType();
51242 if (DstVT != N1.getSimpleValueType())
51243 return SDValue();
51244
51245 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51246 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51247
51248 // Limit to allsignbits packing.
51249 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51250 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51251 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51252 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51253 return SDValue();
51254
51255 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51256 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51257 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51258}
51259
51260/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51261 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51262/// with a shift-right to eliminate loading the vector constant mask value.
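// Illustrative example (not part of the original source): if every element of
// Op0 is known to be all-ones or all-zeros (e.g. a compare result) and the
// mask is <1,1,1,1>, then (and Op0, <1,...>) becomes (vsrli Op0, 31) for
// v4i32, avoiding a constant-pool load for the mask.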
51263 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51264 SelectionDAG &DAG,
51265 const X86Subtarget &Subtarget) {
51266 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51267 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51268 EVT VT = Op0.getValueType();
51269 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51270 return SDValue();
51271
51272 // Try to convert an "is positive" signbit masking operation into arithmetic
51273 // shift and "andn". This saves a materialization of a -1 vector constant.
51274 // The "is negative" variant should be handled more generally because it only
51275 // requires "and" rather than "andn":
51276 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51277 //
51278 // This is limited to the original type to avoid producing even more bitcasts.
51279 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51280 // will be profitable.
51281 if (N->getValueType(0) == VT &&
51282 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51283 SDValue X, Y;
51284 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51285 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51286 X = Op1.getOperand(0);
51287 Y = Op0;
51288 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51289 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51290 X = Op0.getOperand(0);
51291 Y = Op1;
51292 }
51293 if (X && Y) {
51294 SDValue Sra =
51295 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51296 VT.getScalarSizeInBits() - 1, DAG);
51297 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51298 }
51299 }
51300
51301 APInt SplatVal;
51302 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51303 return SDValue();
51304
51305 // Don't prevent creation of ANDN.
51306 if (isBitwiseNot(Op0))
51307 return SDValue();
51308
51309 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51310 return SDValue();
51311
51312 unsigned EltBitWidth = VT.getScalarSizeInBits();
51313 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51314 return SDValue();
51315
51316 unsigned ShiftVal = SplatVal.countr_one();
51317 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51318 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51319 return DAG.getBitcast(N->getValueType(0), Shift);
51320}
51321
51322// Get the index node from the lowered DAG of a GEP IR instruction with one
51323// indexing dimension.
51324 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51325 if (Ld->isIndexed())
51326 return SDValue();
51327
51328 SDValue Base = Ld->getBasePtr();
51329 if (Base.getOpcode() != ISD::ADD)
51330 return SDValue();
51331
51332 SDValue ShiftedIndex = Base.getOperand(0);
51333 if (ShiftedIndex.getOpcode() != ISD::SHL)
51334 return SDValue();
51335
51336 return ShiftedIndex.getOperand(0);
51337}
51338
51339static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51340 return Subtarget.hasBMI2() &&
51341 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51342}
51343
51344/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51345/// This undoes the inverse fold performed in InstCombine
51346 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51347 SelectionDAG &DAG) {
51348 using namespace llvm::SDPatternMatch;
51349 MVT VT = N->getSimpleValueType(0);
51350 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51351 return SDValue();
51352
51353 SDValue X, Y, Z;
51354 if (sd_match(N, m_And(m_Value(X),
51355 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51356 // Don't fold if Y or Z are constants to prevent infinite loops.
51357 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51358 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51359 return DAG.getNode(
51360 ISD::AND, DL, VT, X,
51361 DAG.getNOT(
51362 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51363 }
51364
51365 return SDValue();
51366}
51367
51368// This function recognizes cases where X86 bzhi instruction can replace and
51369// 'and-load' sequence.
51370// In case of loading integer value from an array of constants which is defined
51371// as follows:
51372//
51373// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51374//
51375// then applying a bitwise and on the result with another input.
51376// It's equivalent to performing bzhi (zero high bits) on the input, with the
51377// same index of the load.
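// For example (illustrative), with the array layout above:
//   x & array[idx]  ==  x & ((1u << idx) - 1)  ==  bzhi(x, idx)
// so the load and the AND collapse into a single BZHI instruction.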
51378 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51379 const X86Subtarget &Subtarget) {
51380 MVT VT = Node->getSimpleValueType(0);
51381 SDLoc dl(Node);
51382
51383 // Check if subtarget has BZHI instruction for the node's type
51384 if (!hasBZHI(Subtarget, VT))
51385 return SDValue();
51386
51387 // Try matching the pattern for both operands.
51388 for (unsigned i = 0; i < 2; i++) {
51389 // continue if the operand is not a load instruction
51390 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51391 if (!Ld)
51392 continue;
51393 const Value *MemOp = Ld->getMemOperand()->getValue();
51394 if (!MemOp)
51395 continue;
51396 // Get the Node which indexes into the array.
51397 SDValue Index = getIndexFromUnindexedLoad(Ld);
51398 if (!Index)
51399 continue;
51400
51401 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51402 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51403 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51404 Constant *Init = GV->getInitializer();
51405 Type *Ty = Init->getType();
51406 if (!Ty->isArrayTy() ||
51407 !Ty->getArrayElementType()->isIntegerTy() ||
51408 Ty->getArrayElementType()->getScalarSizeInBits() !=
51409 VT.getSizeInBits() ||
51410 Ty->getArrayNumElements() >
51411 Ty->getArrayElementType()->getScalarSizeInBits())
51412 continue;
51413
51414 // Check if the array's constant elements are suitable to our case.
51415 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51416 bool ConstantsMatch = true;
51417 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51418 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51419 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51420 ConstantsMatch = false;
51421 break;
51422 }
51423 }
51424 if (!ConstantsMatch)
51425 continue;
51426
51427 // Do the transformation (For 32-bit type):
51428 // -> (and (load arr[idx]), inp)
51429 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51430 // that will be replaced with one bzhi instruction.
51431 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51432 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51433
51434 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51435 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51436 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51437
51438 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51439 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51440 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51441 }
51442 }
51443 }
51444 }
51445 return SDValue();
51446}
51447
51448// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51449// Where C is a mask containing the same number of bits as the setcc and
51450// where the setcc will freely 0 upper bits of k-register. We can replace the
51451// undef in the concat with 0s and remove the AND. This mainly helps with
51452// v2i1/v4i1 setcc being casted to scalar.
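// Illustrative example (not part of the original source): in
//   (and (i8 (bitcast (v8i1 (concat (v2i1 setcc), undef, undef, undef)))), 3)
// the undef subvectors can be replaced with zeroes, after which the AND with
// the 2-bit mask 3 is redundant and can be removed.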
51453 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51454 const X86Subtarget &Subtarget) {
51455 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51456
51457 EVT VT = N->getValueType(0);
51458
51459 // Make sure this is an AND with constant. We will check the value of the
51460 // constant later.
51461 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51462 if (!C1)
51463 return SDValue();
51464
51465 // This is implied by the ConstantSDNode.
51466 assert(!VT.isVector() && "Expected scalar VT!");
51467
51468 SDValue Src = N->getOperand(0);
51469 if (!Src.hasOneUse())
51470 return SDValue();
51471
51472 // (Optionally) peek through any_extend().
51473 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51474 if (!Src.getOperand(0).hasOneUse())
51475 return SDValue();
51476 Src = Src.getOperand(0);
51477 }
51478
51479 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51480 return SDValue();
51481
51482 Src = Src.getOperand(0);
51483 EVT SrcVT = Src.getValueType();
51484
51485 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51486 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51487 !TLI.isTypeLegal(SrcVT))
51488 return SDValue();
51489
51490 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51491 return SDValue();
51492
51493 // We only care about the first subvector of the concat, we expect the
51494 // other subvectors to be ignored due to the AND if we make the change.
51495 SDValue SubVec = Src.getOperand(0);
51496 EVT SubVecVT = SubVec.getValueType();
51497
51498 // The RHS of the AND should be a mask with as many bits as SubVec.
51499 if (!TLI.isTypeLegal(SubVecVT) ||
51500 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51501 return SDValue();
51502
51503 // First subvector should be a setcc with a legal result type or a
51504 // AND containing at least one setcc with a legal result type.
51505 auto IsLegalSetCC = [&](SDValue V) {
51506 if (V.getOpcode() != ISD::SETCC)
51507 return false;
51508 EVT SetccVT = V.getOperand(0).getValueType();
51509 if (!TLI.isTypeLegal(SetccVT) ||
51510 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51511 return false;
51512 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51513 return false;
51514 return true;
51515 };
51516 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51517 (IsLegalSetCC(SubVec.getOperand(0)) ||
51518 IsLegalSetCC(SubVec.getOperand(1))))))
51519 return SDValue();
51520
51521 // We passed all the checks. Rebuild the concat_vectors with zeroes
51522 // and cast it back to VT.
51523 SDLoc dl(N);
51524 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51525 DAG.getConstant(0, dl, SubVecVT));
51526 Ops[0] = SubVec;
51527 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51528 Ops);
51529 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51530 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51531}
51532
51533 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51534 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51535 // We don't want to go crazy with the recursion here. This isn't a super
51536 // important optimization.
51537 static constexpr unsigned kMaxDepth = 2;
51538
51539 // Only do this re-ordering if op has one use.
51540 if (!Op.hasOneUse())
51541 return SDValue();
51542
51543 SDLoc DL(Op);
51544 // If we hit another associative op, recurse further.
51545 if (Op.getOpcode() == Opc) {
51546 // Done recursing.
51547 if (Depth++ >= kMaxDepth)
51548 return SDValue();
51549
51550 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51551 if (SDValue R =
51552 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51553 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51554 Op.getOperand(1 - OpIdx));
51555
51556 } else if (Op.getOpcode() == ISD::SUB) {
51557 if (Opc == ISD::AND) {
51558 // BLSI: (and x, (sub 0, x))
51559 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51560 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51561 }
51562 // Opc must be ISD::AND or ISD::XOR
51563 // BLSR: (and x, (sub x, 1))
51564 // BLSMSK: (xor x, (sub x, 1))
51565 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51566 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51567
51568 } else if (Op.getOpcode() == ISD::ADD) {
51569 // Opc must be ISD::AND or ISD::XOR
51570 // BLSR: (and x, (add x, -1))
51571 // BLSMSK: (xor x, (add x, -1))
51572 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51573 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51574 }
51575 return SDValue();
51576}
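// Illustrative summary (not part of the original source) of the BMI patterns
// matched above, for an i32 value x:
//   BLSI:   x & -x       (isolate lowest set bit)
//   BLSR:   x & (x - 1)  (clear lowest set bit)
//   BLSMSK: x ^ (x - 1)  (mask up to and including lowest set bit)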
51577
51578 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51579 const X86Subtarget &Subtarget) {
51580 EVT VT = N->getValueType(0);
51581 // Make sure this node is a candidate for BMI instructions.
51582 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51583 (VT != MVT::i32 && VT != MVT::i64))
51584 return SDValue();
51585
51586 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51587
51588 // Try and match LHS and RHS.
51589 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51590 if (SDValue OpMatch =
51591 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51592 N->getOperand(1 - OpIdx), 0))
51593 return OpMatch;
51594 return SDValue();
51595}
51596
51597/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
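// Illustrative reasoning (not part of the original source): x ^ -x is the
// bitwise NOT of x ^ (x - 1), i.e. of BLSMSK(x); e.g. for x = 0b0100,
// x ^ -x = ...11000 and x ^ (x - 1) = 0b00111. Hence AND(Y, XOR(X, NEG(X)))
// equals ANDN(BLSMSK(X), Y), one BLSMSK plus one ANDN.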
51598 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51599 SelectionDAG &DAG,
51600 const X86Subtarget &Subtarget) {
51601 using namespace llvm::SDPatternMatch;
51602
51603 EVT VT = And->getValueType(0);
51604 // Make sure this node is a candidate for BMI instructions.
51605 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51606 return SDValue();
51607
51608 SDValue X;
51609 SDValue Y;
51610 if (!sd_match(And,
51611 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51612 m_Value(Y))))
51613 return SDValue();
51614
51615 SDValue BLSMSK =
51616 DAG.getNode(ISD::XOR, DL, VT, X,
51617 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51618 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51619 return AndN;
51620}
51621
51622 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51623 SelectionDAG &DAG,
51624 TargetLowering::DAGCombinerInfo &DCI,
51625 const X86Subtarget &ST) {
51626 // cmp(setcc(cc, X), 0)
51627 // brcond ne
51628 // ->
51629 // X
51630 // brcond cc
51631
51632 // sub(setcc(cc, X), 1)
51633 // brcond ne
51634 // ->
51635 // X
51636 // brcond ~cc
51637 //
51638 // if only flag has users
51639
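// Illustrative example (not part of the original source):
//   t = setcc(cc, flags); cmp(t, 0); brcond COND_NE
// branches exactly when the setcc produced 1, i.e. when `cc` held on the
// original flags, so the cmp is dropped and the branch uses `cc` directly.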
51640 SDValue SetCC = N->getOperand(0);
51641
51642 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51643 return SDValue();
51644
51645 // Check that the only user of the flag is `brcond ne`.
51646 SDNode *BrCond = *Flag->user_begin();
51647 if (BrCond->getOpcode() != X86ISD::BRCOND)
51648 return SDValue();
51649 unsigned CondNo = 2;
51650 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51651 X86::COND_NE)
51652 return SDValue();
51653
51654 SDValue X = SetCC.getOperand(1);
51655 // sub has two results while X only has one. DAG combine assumes the value
51656 // type matches.
51657 if (N->getOpcode() == X86ISD::SUB)
51658 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51659
51660 SDValue CCN = SetCC.getOperand(0);
51661 X86::CondCode CC =
51662 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51663 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51664 // Update CC for the consumer of the flag.
51665 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51666 // checking if the second condition evaluates to true. When comparing the
51667 // result with 1, we are checking if the second condition evaluates to false.
51668 SmallVector<SDValue> Ops(BrCond->op_begin(), BrCond->op_end());
51669 if (isNullConstant(N->getOperand(1)))
51670 Ops[CondNo] = CCN;
51671 else if (isOneConstant(N->getOperand(1)))
51672 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51673 else
51674 llvm_unreachable("expect constant 0 or 1");
51675
51676 SDValue NewBrCond =
51677 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51678 // Avoid self-assign error b/c CC1 can be `e/ne`.
51679 if (BrCond != NewBrCond.getNode())
51680 DCI.CombineTo(BrCond, NewBrCond);
51681 return X;
51682}
51683
51684 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51685 TargetLowering::DAGCombinerInfo &DCI,
51686 const X86Subtarget &ST) {
51687 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51688 // ->
51689 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51690
51691 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51692 // ->
51693 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51694 //
51695 // where cflags is determined by cc1.
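// Illustrative example (not part of the original source):
//   and (setcc cc0, flags0), (setcc cc1, (sub X, Y))
// becomes setcc(cc1, ccmp(X, Y, cflags, cc0, flags0)): the compare executes
// only when cc0 holds; otherwise the ccmp writes `cflags`, chosen so that the
// outer setcc(cc1) yields false, which matches the AND semantics.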
51696
51697 if (!ST.hasCCMP())
51698 return SDValue();
51699
51700 SDValue SetCC0 = N->getOperand(0);
51701 SDValue SetCC1 = N->getOperand(1);
51702 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51703 SetCC1.getOpcode() != X86ISD::SETCC)
51704 return SDValue();
51705
51706 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51707 SDValue Op = V.getOperand(1);
51708 unsigned Opc = Op.getOpcode();
51709 if (Opc == X86ISD::SUB)
51710 return X86ISD::CCMP;
51711 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51712 return X86ISD::CTEST;
51713 return 0U;
51714 };
51715
51716 unsigned NewOpc = 0;
51717
51718 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51719 // appear on the right.
51720 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51721 std::swap(SetCC0, SetCC1);
51722 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51723 return SDValue();
51724 }
51725
51726 X86::CondCode CC0 =
51727 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51728 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51729 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51730 return SDValue();
51731
51732 bool IsOR = N->getOpcode() == ISD::OR;
51733
51734 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51735 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51736 // operator is OR. Similar for CC1.
51737 SDValue SrcCC =
51738 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51739 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51740 : SetCC0.getOperand(0);
51741 SDValue CC1N = SetCC1.getOperand(0);
51742 X86::CondCode CC1 =
51743 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51744 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51745 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51746 SDLoc DL(N);
51747 SDValue CFlags = DAG.getTargetConstant(
51748 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51749 SDValue Sub = SetCC1.getOperand(1);
51750
51751 // Replace any uses of the old flag produced by SUB/CMP with the new one
51752 // produced by CCMP/CTEST.
51753 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51754 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51755 {Sub.getOperand(0), Sub.getOperand(1),
51756 CFlags, SrcCC, SetCC0.getOperand(1)})
51757 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51758 {Sub.getOperand(0), Sub.getOperand(0),
51759 CFlags, SrcCC, SetCC0.getOperand(1)});
51760
51761 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51762}
51763
51764 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51765 TargetLowering::DAGCombinerInfo &DCI,
51766 const X86Subtarget &Subtarget) {
51767 using namespace SDPatternMatch;
51768
51769 SDValue N0 = N->getOperand(0);
51770 SDValue N1 = N->getOperand(1);
51771 EVT VT = N->getValueType(0);
51772 SDLoc dl(N);
51773 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51774
51775 // If this is SSE1 only convert to FAND to avoid scalarization.
51776 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51777 return DAG.getBitcast(MVT::v4i32,
51778 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51779 DAG.getBitcast(MVT::v4f32, N0),
51780 DAG.getBitcast(MVT::v4f32, N1)));
51781 }
51782
51783 // Use a 32-bit and+zext if upper bits known zero.
51784 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51785 APInt HiMask = APInt::getHighBitsSet(64, 32);
51786 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51787 DAG.MaskedValueIsZero(N0, HiMask)) {
51788 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51789 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51790 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51791 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51792 }
51793 }
51794
51795 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51796 // TODO: Support multiple SrcOps.
51797 if (VT == MVT::i1) {
51798 SmallVector<SDValue, 2> SrcOps;
51799 SmallVector<APInt, 2> SrcPartials;
51800 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51801 SrcOps.size() == 1) {
51802 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51803 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51804 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51805 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51806 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51807 if (Mask) {
51808 assert(SrcPartials[0].getBitWidth() == NumElts &&
51809 "Unexpected partial reduction mask");
51810 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51811 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51812 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51813 }
51814 }
51815 }
51816
51817 // InstCombine converts:
51818 // `(-x << C0) & C1`
51819 // to
51820 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51821 // This saves an IR instruction but on x86 the neg/shift version is preferable
51822 // so undo the transform.
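// Worked instance (illustrative, not part of the original source): for
//   (-x << 3) & 0xF8
// InstCombine emits (x * 0xF8) & 0xF8, since Pow2_Ceil(0xF8) - (1 << 3) =
// 0x100 - 8 = 0xF8; the code below recovers the cheaper neg+shl form from
// the multiply constant.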
51823
51824 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51825 // TODO: We don't actually need a splat for this, we just need the checks to
51826 // hold for each element.
51827 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51828 /*AllowTruncation*/ false);
51829 ConstantSDNode *N01C =
51830 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51831 /*AllowTruncation*/ false);
51832 if (N1C && N01C) {
51833 const APInt &MulC = N01C->getAPIntValue();
51834 const APInt &AndC = N1C->getAPIntValue();
51835 APInt MulCLowBit = MulC & (-MulC);
51836 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51837 (MulCLowBit + MulC).isPowerOf2()) {
51838 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51839 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51840 assert(MulCLowBitLog != -1 &&
51841 "Isolated lowbit is somehow not a power of 2!");
51842 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51843 DAG.getConstant(MulCLowBitLog, dl, VT));
51844 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51845 }
51846 }
51847 }
51848
51849 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51850 return SetCC;
51851
51852 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51853 return V;
51854
51855 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51856 return R;
51857
51858 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51859 return R;
51860
51861 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51862 return R;
51863
51864 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51865 DAG, DCI, Subtarget))
51866 return FPLogic;
51867
51868 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51869 return R;
51870
51871 if (DCI.isBeforeLegalizeOps())
51872 return SDValue();
51873
51874 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51875 return R;
51876
51877 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51878 return R;
51879
51880 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51881 return ShiftRight;
51882
51883 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51884 return R;
51885
51886 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51887 return R;
51888
51889 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51890 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51891 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51892 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51893 unsigned Opc0 = N0.getOpcode();
51894 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51895 getTargetConstantFromNode(N0.getOperand(1)) &&
51896 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51897 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51898 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51899 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51900 }
51901 }
51902
51903 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51904 // to make use of predicated selects.
51905 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51906 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51907 SDValue X, Y;
51908 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51909 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51910 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51911 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51912 sd_match(N, m_And(m_Value(X),
51913 m_OneUse(m_SExt(m_AllOf(
51914 m_Value(Y), m_SpecificVT(CondVT),
51915 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51916 return DAG.getSelect(dl, VT, Y, X,
51917 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51918 }
51919 }
51920
51921 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51922 // to avoid a slow variable shift (moving the shift amount to ECX etc.)
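// For example (illustrative): `(x >> y) & 1` with a non-constant y becomes
// `bt x, y; setb`, so the shift amount never has to be moved into CL.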
51923 if (isOneConstant(N1) && N0->hasOneUse()) {
51924 SDValue Src = N0;
51925 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51926 Src.getOpcode() == ISD::TRUNCATE) &&
51927 Src.getOperand(0)->hasOneUse())
51928 Src = Src.getOperand(0);
51929 bool ContainsNOT = false;
51930 X86::CondCode X86CC = X86::COND_B;
51931 // Peek through AND(NOT(SRL(X,Y)),1).
51932 if (isBitwiseNot(Src)) {
51933 Src = Src.getOperand(0);
51934 X86CC = X86::COND_AE;
51935 ContainsNOT = true;
51936 }
51937 if (Src.getOpcode() == ISD::SRL &&
51938 !isa<ConstantSDNode>(Src.getOperand(1))) {
51939 SDValue BitNo = Src.getOperand(1);
51940 Src = Src.getOperand(0);
51941 // Peek through AND(SRL(NOT(X),Y),1).
51942 if (isBitwiseNot(Src)) {
51943 Src = Src.getOperand(0);
51944 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51945 ContainsNOT = true;
51946 }
51947 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51948 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51949 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51950 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51951 }
51952 }
51953
51954 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51955 // Attempt to recursively combine a bitmask AND with shuffles.
51956 SDValue Op(N, 0);
51957 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51958 return Res;
51959
51960 // If either operand is a constant mask, then only the elements that aren't
51961 // zero are actually demanded by the other operand.
51962 auto GetDemandedMasks = [&](SDValue Op) {
51963 APInt UndefElts;
51964 SmallVector<APInt> EltBits;
51965 int NumElts = VT.getVectorNumElements();
51966 int EltSizeInBits = VT.getScalarSizeInBits();
51967 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51968 APInt DemandedElts = APInt::getAllOnes(NumElts);
51969 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51970 EltBits)) {
51971 DemandedBits.clearAllBits();
51972 DemandedElts.clearAllBits();
51973 for (int I = 0; I != NumElts; ++I) {
51974 if (UndefElts[I]) {
51975 // We can't assume an undef src element gives an undef dst - the
51976 // other src might be zero.
51977 DemandedBits.setAllBits();
51978 DemandedElts.setBit(I);
51979 } else if (!EltBits[I].isZero()) {
51980 DemandedBits |= EltBits[I];
51981 DemandedElts.setBit(I);
51982 }
51983 }
51984 }
51985 return std::make_pair(DemandedBits, DemandedElts);
51986 };
51987 APInt Bits0, Elts0;
51988 APInt Bits1, Elts1;
51989 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51990 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51991
51992 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51993 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51994 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51995 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51996 if (N->getOpcode() != ISD::DELETED_NODE)
51997 DCI.AddToWorklist(N);
51998 return SDValue(N, 0);
51999 }
52000
52001 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52002 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52003 if (NewN0 || NewN1)
52004 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52005 NewN1 ? NewN1 : N1);
52006 }
52007
52008 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52009 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52010 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52011 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52012 SDValue BitMask = N1;
52013 SDValue SrcVec = N0.getOperand(0);
52014 EVT SrcVecVT = SrcVec.getValueType();
52015
52016 // Check that the constant bitmask masks whole bytes.
52017 APInt UndefElts;
52018 SmallVector<APInt, 64> EltBits;
52019 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52020 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52021 llvm::all_of(EltBits, [](const APInt &M) {
52022 return M.isZero() || M.isAllOnes();
52023 })) {
52024 unsigned NumElts = SrcVecVT.getVectorNumElements();
52025 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52026 unsigned Idx = N0.getConstantOperandVal(1);
52027
52028 // Create a root shuffle mask from the byte mask and the extracted index.
52029 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52030 for (unsigned i = 0; i != Scale; ++i) {
52031 if (UndefElts[i])
52032 continue;
52033 int VecIdx = Scale * Idx + i;
52034 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52035 }
52036
52037 if (SDValue Shuffle = combineX86ShufflesRecursively(
52038 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52039 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52040 /*AllowVariableCrossLaneMask=*/true,
52041 /*AllowVariablePerLaneMask=*/true,
52042 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52043 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52044 N0.getOperand(1));
52045 }
52046 }
52047
52048 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52049 return R;
52050
52051 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52052 return R;
52053
52054 return SDValue();
52055}
52056
52057// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
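// Illustrative example (not part of the original source): with a byte mask
// C = <0xFF,0x00,...>, OR(AND(X,C),AND(Y,~C)) picks bytes from X where C is
// set and from Y elsewhere; rewriting the second AND as ANDNP(C,Y) lets both
// sides share the single constant C (or fold into one VPTERNLOG below).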
52058 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52059 SelectionDAG &DAG,
52060 const X86Subtarget &Subtarget) {
52061 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52062
52063 MVT VT = N->getSimpleValueType(0);
52064 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52065 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52066 return SDValue();
52067
52068 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52069 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52070 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52071 return SDValue();
52072
52073 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52074 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52075 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52076 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52077 return SDValue();
52078
52079 // Attempt to extract constant byte masks.
52080 APInt UndefElts0, UndefElts1;
52081 SmallVector<APInt, 32> EltBits0, EltBits1;
52082 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52083 /*AllowWholeUndefs*/ false,
52084 /*AllowPartialUndefs*/ false))
52085 return SDValue();
52086 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52087 /*AllowWholeUndefs*/ false,
52088 /*AllowPartialUndefs*/ false))
52089 return SDValue();
52090
52091 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52092 // TODO - add UNDEF elts support.
52093 if (UndefElts0[i] || UndefElts1[i])
52094 return SDValue();
52095 if (EltBits0[i] != ~EltBits1[i])
52096 return SDValue();
52097 }
52098
52099 if (useVPTERNLOG(Subtarget, VT)) {
52100 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52101 // VPTERNLOG is only available as vXi32/64-bit types.
52102 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52103 MVT OpVT =
52104 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52105 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52106 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52107 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52108 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52109 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52110 DAG, Subtarget);
52111 return DAG.getBitcast(VT, Res);
52112 }
52113
52114 SDValue X = N->getOperand(0);
52115 SDValue Y =
52116 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52117 DAG.getBitcast(VT, N1.getOperand(0)));
52118 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52119}
52120
52121// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52122// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52123// Waiting for ANDNP combine allows other combines to happen that prevent
52124// matching.
52125static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52126 using namespace SDPatternMatch;
52127 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52128 m_And(m_Deferred(Mask), m_Value(Y))));
52129}
52130
52131// Try to fold:
52132// (or (and (m, y), (pandn m, x)))
52133// into:
52134// (vselect m, x, y)
52135// As a special case, try to fold:
52136// (or (and (m, (sub 0, x)), (pandn m, x)))
52137// into:
52138// (sub (xor X, M), M)
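// Illustrative reasoning (not part of the original source) for the special
// case: in a lane where M is all-ones, (xor X, M) - M == (~X) + 1 == -X,
// and in a lane where M is zero the value is just X, so the blend of X and
// (sub 0, X) reduces to a conditional negate with no PBLENDVB needed.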
52139 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52140 SelectionDAG &DAG,
52141 const X86Subtarget &Subtarget) {
52142 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52143
52144 EVT VT = N->getValueType(0);
52145 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52146 (VT.is256BitVector() && Subtarget.hasInt256())))
52147 return SDValue();
52148
52149 SDValue X, Y, Mask;
52150 if (!matchLogicBlend(N, X, Y, Mask))
52151 return SDValue();
52152
52153 // Validate that X, Y, and Mask are bitcasts, and see through them.
52154 Mask = peekThroughBitcasts(Mask);
52155 X = peekThroughBitcasts(X);
52156 Y = peekThroughBitcasts(Y);
52157
52158 EVT MaskVT = Mask.getValueType();
52159 unsigned EltBits = MaskVT.getScalarSizeInBits();
52160
52161 // TODO: Attempt to handle floating point cases as well?
52162 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52163 return SDValue();
52164
52165 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52166 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52167 DAG, Subtarget))
52168 return Res;
52169
52170 // PBLENDVB is only available on SSE 4.1.
52171 if (!Subtarget.hasSSE41())
52172 return SDValue();
52173
52174 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52175 if (Subtarget.hasVLX())
52176 return SDValue();
52177
52178 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52179
52180 X = DAG.getBitcast(BlendVT, X);
52181 Y = DAG.getBitcast(BlendVT, Y);
52182 Mask = DAG.getBitcast(BlendVT, Mask);
52183 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52184 return DAG.getBitcast(VT, Mask);
52185}
52186
52187// Helper function for combineOrCmpEqZeroToCtlzSrl
52188// Transforms:
52189// seteq(cmp x, 0)
52190// into:
52191// srl(ctlz x), log2(bitsize(x))
52192// Input pattern is checked by caller.
52193 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52194 SDValue Cmp = Op.getOperand(1);
52195 EVT VT = Cmp.getOperand(0).getValueType();
52196 unsigned Log2b = Log2_32(VT.getSizeInBits());
52197 SDLoc dl(Op);
52198 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52199 // The result of the shift is true or false, and on X86, the 32-bit
52200 // encoding of shr and lzcnt is more desirable.
52201 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52202 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52203 DAG.getConstant(Log2b, dl, MVT::i8));
52204 return Scc;
52205}
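// Example (illustrative, assuming a 32-bit compare): ctlz(x) == 32 if and only
// if x == 0, and 32 is the only possible ctlz result with bit 5 set, so
//   seteq(cmp x, 0)  ==  srl(ctlz x, 5)     // Log2_32(32) == 5
// which is exactly what this helper emits.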
52206
52207// Try to transform:
52208// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52209// into:
52210 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
52211// Will also attempt to match more generic cases, eg:
52212// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52213// Only applies if the target supports the FastLZCNT feature.
52214 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52215                                            TargetLowering::DAGCombinerInfo &DCI,
52216 const X86Subtarget &Subtarget) {
52217 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52218 return SDValue();
52219
52220 auto isORCandidate = [](SDValue N) {
52221 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52222 };
52223
52224   // Check that the zero extend is extending to 32 bits or more. The code
52225   // generated by srl(ctlz) for 16-bit or narrower variants of the pattern
52226   // would require extra instructions to clear the upper bits.
52227 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52228 !isORCandidate(N->getOperand(0)))
52229 return SDValue();
52230
52231 // Check the node matches: setcc(eq, cmp 0)
52232 auto isSetCCCandidate = [](SDValue N) {
52233 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52234 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52235 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52236 isNullConstant(N->getOperand(1).getOperand(1)) &&
52237 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52238 };
52239
52240 SDNode *OR = N->getOperand(0).getNode();
52241 SDValue LHS = OR->getOperand(0);
52242 SDValue RHS = OR->getOperand(1);
52243
52244 // Save nodes matching or(or, setcc(eq, cmp 0)).
52245   SmallVector<SDNode *, 2> ORNodes;
52246 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52247 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52248 ORNodes.push_back(OR);
52249 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52250 LHS = OR->getOperand(0);
52251 RHS = OR->getOperand(1);
52252 }
52253
52254 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52255 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52256 !isORCandidate(SDValue(OR, 0)))
52257 return SDValue();
52258
52259   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
52260   // to
52261   // or(srl(ctlz),srl(ctlz)).
52262 // The dag combiner can then fold it into:
52263 // srl(or(ctlz, ctlz)).
52264 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52265 SDValue Ret, NewRHS;
52266 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52267 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52268
52269 if (!Ret)
52270 return SDValue();
52271
52272 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52273 while (!ORNodes.empty()) {
52274 OR = ORNodes.pop_back_val();
52275 LHS = OR->getOperand(0);
52276 RHS = OR->getOperand(1);
52277 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52278 if (RHS->getOpcode() == ISD::OR)
52279 std::swap(LHS, RHS);
52280 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52281 if (!NewRHS)
52282 return SDValue();
52283 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52284 }
52285
52286 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52287}
52288
52289/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52290/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52291/// with CMP+{ADC, SBB}.
52292/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52293static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52294 SDValue X, SDValue Y,
52295 SelectionDAG &DAG,
52296 bool ZeroSecondOpOnly = false) {
52297 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52298 return SDValue();
52299
52300 // Look through a one-use zext.
52301 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52302 Y = Y.getOperand(0);
52303
52304 X86::CondCode CC;
52305 SDValue EFLAGS;
52306 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52307 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52308 EFLAGS = Y.getOperand(1);
52309 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52310 Y.hasOneUse()) {
52311 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52312 }
52313
52314 if (!EFLAGS)
52315 return SDValue();
52316
52317 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52318 // the general case below.
52319 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52320 if (ConstantX && !ZeroSecondOpOnly) {
52321 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52322 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52323 // This is a complicated way to get -1 or 0 from the carry flag:
52324 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52325 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52326 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52327 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52328 EFLAGS);
52329 }
52330
52331 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52332 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52333 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52334 EFLAGS.getValueType().isInteger() &&
52335 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52336 // Swap the operands of a SUB, and we have the same pattern as above.
52337 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52338 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52339 SDValue NewSub = DAG.getNode(
52340 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52341 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52342 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52343 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52344 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52345 NewEFLAGS);
52346 }
52347 }
52348 }
52349
52350 if (CC == X86::COND_B) {
52351 // X + SETB Z --> adc X, 0
52352 // X - SETB Z --> sbb X, 0
52353 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52354 DAG.getVTList(VT, MVT::i32), X,
52355 DAG.getConstant(0, DL, VT), EFLAGS);
52356 }
52357
52358 if (ZeroSecondOpOnly)
52359 return SDValue();
52360
52361 if (CC == X86::COND_A) {
52362 // Try to convert COND_A into COND_B in an attempt to facilitate
52363 // materializing "setb reg".
52364 //
52365 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52366 // cannot take an immediate as its first operand.
52367 //
52368 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52369 EFLAGS.getValueType().isInteger() &&
52370 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52371 SDValue NewSub =
52372 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52373 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52374 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52375 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52376 DAG.getVTList(VT, MVT::i32), X,
52377 DAG.getConstant(0, DL, VT), NewEFLAGS);
52378 }
52379 }
52380
52381 if (CC == X86::COND_AE) {
52382 // X + SETAE --> sbb X, -1
52383 // X - SETAE --> adc X, -1
52384 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52385 DAG.getVTList(VT, MVT::i32), X,
52386 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52387 }
52388
52389 if (CC == X86::COND_BE) {
52390 // X + SETBE --> sbb X, -1
52391 // X - SETBE --> adc X, -1
52392 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52393 // materializing "setae reg".
52394 //
52395 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52396 // cannot take an immediate as its first operand.
52397 //
52398 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52399 EFLAGS.getValueType().isInteger() &&
52400 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52401 SDValue NewSub =
52402 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52403 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52404 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52405 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52406 DAG.getVTList(VT, MVT::i32), X,
52407 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52408 }
52409 }
52410
52411 if (CC != X86::COND_E && CC != X86::COND_NE)
52412 return SDValue();
52413
52414 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52415 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52416 !EFLAGS.getOperand(0).getValueType().isInteger())
52417 return SDValue();
52418
52419 SDValue Z = EFLAGS.getOperand(0);
52420 EVT ZVT = Z.getValueType();
52421
52422 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52423 // the general case below.
52424 if (ConstantX) {
52425 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52426 // fake operands:
52427 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52428 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52429 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52430 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52431 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52432 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52433 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52434 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52435 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52436 SDValue(Neg.getNode(), 1));
52437 }
52438
52439 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52440 // with fake operands:
52441 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52442 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52443 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52444 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52445 SDValue One = DAG.getConstant(1, DL, ZVT);
52446 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52447 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52448 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52449 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52450 Cmp1.getValue(1));
52451 }
52452 }
52453
52454 // (cmp Z, 1) sets the carry flag if Z is 0.
52455 SDValue One = DAG.getConstant(1, DL, ZVT);
52456 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52457 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52458
52459 // Add the flags type for ADC/SBB nodes.
52460 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52461
52462 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52463 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52464 if (CC == X86::COND_NE)
52465 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52466 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52467
52468 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52469 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52470 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52471 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52472}
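// Example (illustrative, assuming i32 operands a, b, X): for
//   X + (a < b)   i.e.  add X, zext(setb(cmp a, b))
// the COND_B path above yields "cmp a, b; adc X, 0", and the subtract form
//   X - (a < b)   yields "cmp a, b; sbb X, 0",
// replacing the TEST+SETcc+ADD/SUB sequence with a single flag-consuming op.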
52473
52474/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52475/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52476/// with CMP+{ADC, SBB}.
52477 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52478 SelectionDAG &DAG) {
52479 bool IsSub = N->getOpcode() == ISD::SUB;
52480 SDValue X = N->getOperand(0);
52481 SDValue Y = N->getOperand(1);
52482 EVT VT = N->getValueType(0);
52483
52484 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52485 return ADCOrSBB;
52486
52487 // Commute and try again (negate the result for subtracts).
52488 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52489 if (IsSub)
52490 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52491 return ADCOrSBB;
52492 }
52493
52494 return SDValue();
52495}
52496
52497static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52498 SDValue N0, SDValue N1,
52499 SelectionDAG &DAG) {
52500 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52501
52502 // Delegate to combineAddOrSubToADCOrSBB if we have:
52503 //
52504 // (xor/or (zero_extend (setcc)) imm)
52505 //
52506 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52507 // equivalent to a SUB/ADD, respectively.
52508 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52509 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52510 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52511 bool IsSub = Opc == ISD::XOR;
52512 bool N1COdd = N1C->getZExtValue() & 1;
52513 if (IsSub ? N1COdd : !N1COdd)
52514 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52515 return R;
52516 }
52517 }
52518
52519 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52520 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52521 N0.getOperand(0).getOpcode() == ISD::AND &&
52522       ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52523       ISD::isBuildVectorAllOnes(N1.getNode()) &&
52524       isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52525 VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52526 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52527 N0.getOperand(0).getOperand(1));
52528 }
52529
52530 return SDValue();
52531}
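// Example (illustrative): with c = zext(setcc) in {0, 1},
//   xor(c, 5) == 5 - c   (odd immediate: the xor only flips the low bit)
//   or (c, 4) == 4 + c   (even immediate: the or only sets the low bit)
// which is why both cases can reuse the ADC/SBB lowering above.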
52532
52533 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52534                          TargetLowering::DAGCombinerInfo &DCI,
52535 const X86Subtarget &Subtarget) {
52536 SDValue N0 = N->getOperand(0);
52537 SDValue N1 = N->getOperand(1);
52538 EVT VT = N->getValueType(0);
52539 SDLoc dl(N);
52540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52541
52542 // If this is SSE1 only convert to FOR to avoid scalarization.
52543 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52544 return DAG.getBitcast(MVT::v4i32,
52545 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52546 DAG.getBitcast(MVT::v4f32, N0),
52547 DAG.getBitcast(MVT::v4f32, N1)));
52548 }
52549
52550 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52551 // TODO: Support multiple SrcOps.
52552 if (VT == MVT::i1) {
52553     SmallVector<SDValue, 2> SrcOps;
52554 SmallVector<APInt, 2> SrcPartials;
52555 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52556 SrcOps.size() == 1) {
52557 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52558 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52559 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52560 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52561 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52562 if (Mask) {
52563 assert(SrcPartials[0].getBitWidth() == NumElts &&
52564 "Unexpected partial reduction mask");
52565 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52566 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52567 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52568 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52569 }
52570 }
52571 }
52572
52573 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52574 return SetCC;
52575
52576 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52577 return R;
52578
52579 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52580 return R;
52581
52582 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52583 return R;
52584
52585 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52586 DAG, DCI, Subtarget))
52587 return FPLogic;
52588
52589 if (DCI.isBeforeLegalizeOps())
52590 return SDValue();
52591
52592 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52593 return R;
52594
52595 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52596 return R;
52597
52598 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52599 return R;
52600
52601 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52602 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52603 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52604 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52605 uint64_t Val = CN->getZExtValue();
52606 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52607 Val == 8) {
52608 SDValue NotCond;
52609 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52610 N0.getOperand(1).hasOneUse()) {
52611           X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52612           X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52613 NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52614 } else if (N0.getOpcode() == ISD::SUB &&
52615 isNullConstant(N0.getOperand(0))) {
52616 SDValue Cond = N0.getOperand(1);
52617 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52618 Cond = Cond.getOperand(0);
52619 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52620 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52621             X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52622 NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52623 }
52624 }
52625
52626 if (NotCond) {
52627 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52628 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52629 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52630 return R;
52631 }
52632 }
52633 }
52634 }
52635
52636 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52637 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52638 // iff the upper elements of the non-shifted arg are zero.
52639   // KUNPCK requires 16+ bool vector elements.
52640 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52641 unsigned NumElts = VT.getVectorNumElements();
52642 unsigned HalfElts = NumElts / 2;
52643 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52644 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52645 N1.getConstantOperandAPInt(1) == HalfElts &&
52646 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52647 return DAG.getNode(
52648 ISD::CONCAT_VECTORS, dl, VT,
52649 extractSubVector(N0, 0, DAG, dl, HalfElts),
52650 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52651 }
52652 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52653 N0.getConstantOperandAPInt(1) == HalfElts &&
52654 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52655 return DAG.getNode(
52656 ISD::CONCAT_VECTORS, dl, VT,
52657 extractSubVector(N1, 0, DAG, dl, HalfElts),
52658 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52659 }
52660 }
52661
52662 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52663 // Attempt to recursively combine an OR of shuffles.
52664 SDValue Op(N, 0);
52665 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52666 return Res;
52667
52668 // If either operand is a constant mask, then only the elements that aren't
52669 // allones are actually demanded by the other operand.
52670 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52671 APInt UndefElts;
52672 SmallVector<APInt> EltBits;
52673 int NumElts = VT.getVectorNumElements();
52674 int EltSizeInBits = VT.getScalarSizeInBits();
52675 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52676 return false;
52677
52678 APInt DemandedElts = APInt::getZero(NumElts);
52679 for (int I = 0; I != NumElts; ++I)
52680 if (!EltBits[I].isAllOnes())
52681 DemandedElts.setBit(I);
52682
52683 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52684 };
52685 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52686 if (N->getOpcode() != ISD::DELETED_NODE)
52687 DCI.AddToWorklist(N);
52688 return SDValue(N, 0);
52689 }
52690 }
52691
52692 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52693 return R;
52694
52695 return SDValue();
52696}
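// Example (illustrative, assuming i32 and C == 3): for or(sub(0, setcc), 3)
// the result is 3 when the setcc is 0 and -1 when it is 1, which matches
//   zext(not(setcc)) * 4 - 1
// and the multiply-by-4 plus decrement folds into a single LEA.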
52697
52698/// Try to turn tests against the signbit in the form of:
52699/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52700/// into:
52701/// SETGT(X, -1)
52702 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52703 SelectionDAG &DAG) {
52704 // This is only worth doing if the output type is i8 or i1.
52705 EVT ResultType = N->getValueType(0);
52706 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52707 return SDValue();
52708
52709 SDValue N0 = N->getOperand(0);
52710 SDValue N1 = N->getOperand(1);
52711
52712 // We should be performing an xor against a truncated shift.
52713 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52714 return SDValue();
52715
52716 // Make sure we are performing an xor against one.
52717 if (!isOneConstant(N1))
52718 return SDValue();
52719
52720 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52721 SDValue Shift = N0.getOperand(0);
52722 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52723 return SDValue();
52724
52725 // Make sure we are truncating from one of i16, i32 or i64.
52726 EVT ShiftTy = Shift.getValueType();
52727 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52728 return SDValue();
52729
52730 // Make sure the shift amount extracts the sign bit.
52731 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52732 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52733 return SDValue();
52734
52735 // Create a greater-than comparison against -1.
52736   // N.B. Using SETGE against 0 works, but we want a canonical-looking
52737   // comparison; using SETGT matches up with what TranslateX86CC does.
52738 SDValue ShiftOp = Shift.getOperand(0);
52739 EVT ShiftOpTy = ShiftOp.getValueType();
52740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52741 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52742 *DAG.getContext(), ResultType);
52743 SDValue Cond =
52744 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52745 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52746 if (SetCCResultType != ResultType)
52747 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52748 return Cond;
52749}
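// Example (illustrative, assuming i32 x truncated to i8):
//   xor(trunc(srl(x, 31)), 1)
// is 1 exactly when the sign bit of x is clear, i.e. when x > -1, so it becomes
// setgt(x, -1), zero-extended back to the original result type if needed.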
52750
52751/// Turn vector tests of the signbit in the form of:
52752/// xor (sra X, elt_size(X)-1), -1
52753/// into:
52754/// pcmpgt X, -1
52755///
52756/// This should be called before type legalization because the pattern may not
52757/// persist after that.
52758 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52759 const X86Subtarget &Subtarget) {
52760 EVT VT = N->getValueType(0);
52761 if (!VT.isSimple())
52762 return SDValue();
52763
52764 switch (VT.getSimpleVT().SimpleTy) {
52765 // clang-format off
52766 default: return SDValue();
52767 case MVT::v16i8:
52768 case MVT::v8i16:
52769 case MVT::v4i32:
52770 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52771 case MVT::v32i8:
52772 case MVT::v16i16:
52773 case MVT::v8i32:
52774 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52775 // clang-format on
52776 }
52777
52778 // There must be a shift right algebraic before the xor, and the xor must be a
52779 // 'not' operation.
52780 SDValue Shift = N->getOperand(0);
52781 SDValue Ones = N->getOperand(1);
52782 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52783       !ISD::isBuildVectorAllOnes(Ones.getNode()))
52784 return SDValue();
52785
52786 // The shift should be smearing the sign bit across each vector element.
52787 auto *ShiftAmt =
52788 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52789 if (!ShiftAmt ||
52790 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52791 return SDValue();
52792
52793 // Create a greater-than comparison against -1. We don't use the more obvious
52794 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52795 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52796}
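// Example (illustrative, assuming v4i32): the 'not' of a sign-splat,
//   xor(sra(X, 31), -1)
// is all-ones exactly in the lanes where X is non-negative, which is what
//   pcmpgt(X, -1)
// computes directly.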
52797
52798/// Detect patterns of truncation with unsigned saturation:
52799///
52800/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52801/// Return the source value x to be truncated or SDValue() if the pattern was
52802/// not matched.
52803///
52804/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52805/// where C1 >= 0 and C2 is unsigned max of destination type.
52806///
52807/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52808/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52809///
52810/// These two patterns are equivalent to:
52811/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52812/// So return the smax(x, C1) value to be truncated or SDValue() if the
52813/// pattern was not matched.
52814 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52815 const SDLoc &DL) {
52816 using namespace llvm::SDPatternMatch;
52817 EVT InVT = In.getValueType();
52818
52819 // Saturation with truncation. We truncate from InVT to VT.
52820   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52821 "Unexpected types for truncate operation");
52822
52823 APInt C1, C2;
52824   SDValue UMin, SMin, SMax;
52825
52826   // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52827   // the element size of the destination type.
52828 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52829 C2.isMask(VT.getScalarSizeInBits()))
52830 return UMin;
52831
52832 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52833       sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52834 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52835 return SMin;
52836
52837 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52838       sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52839 C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52840 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52841
52842 return SDValue();
52843}
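// Example (illustrative, assuming a v8i32 -> v8i8 truncate): for
//   trunc(umin(x, 255))
// the umin already clamps x to the unsigned i8 range, so x is returned and the
// caller can emit an unsigned-saturating truncate (e.g. via PACKUS) instead of
// a plain truncate.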
52844
52845/// Detect patterns of truncation with signed saturation:
52846/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52847/// signed_max_of_dest_type)) to dest_type)
52848/// or:
52849/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52850/// signed_min_of_dest_type)) to dest_type).
52851/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52852/// Return the source value to be truncated or SDValue() if the pattern was not
52853/// matched.
52854static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52855 using namespace llvm::SDPatternMatch;
52856 unsigned NumDstBits = VT.getScalarSizeInBits();
52857 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52858 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52859
52860 APInt SignedMax, SignedMin;
52861 if (MatchPackUS) {
52862 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52863 SignedMin = APInt::getZero(NumSrcBits);
52864 } else {
52865 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52866 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52867 }
52868
52869 SDValue SMin, SMax;
52870 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52871 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52872 return SMax;
52873
52874 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52875 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52876 return SMin;
52877
52878 return SDValue();
52879}
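// Example (illustrative, assuming a v8i16 -> v8i8 truncate): for
//   trunc(smin(smax(x, -128), 127))
// the clamp covers exactly the signed i8 range, so x is returned and the caller
// can emit a signed-saturating truncate (e.g. via PACKSS). With MatchPackUS the
// accepted clamp range is [0, 255] instead.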
52880
52881 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52882 SelectionDAG &DAG,
52883 const X86Subtarget &Subtarget) {
52884 if (!Subtarget.hasSSE2() || !VT.isVector())
52885 return SDValue();
52886
52887 EVT SVT = VT.getVectorElementType();
52888 EVT InVT = In.getValueType();
52889 EVT InSVT = InVT.getVectorElementType();
52890
52891   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52892   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52893   // and concatenate at the same time, then use a final vpmovuswb to
52894   // clip to 0-255.
52895 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52896 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52897 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52898 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52899 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52900 DL, DAG, Subtarget);
52901 assert(Mid && "Failed to pack!");
52902 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52903 }
52904 }
52905
52906 // vXi32 truncate instructions are available with AVX512F.
52907 // vXi16 truncate instructions are only available with AVX512BW.
52908 // For 256-bit or smaller vectors, we require VLX.
52909 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52910   // If the result type is 256 bits or larger and 512-bit registers are
52911   // disabled, we should go ahead and use the pack instructions if possible.
52912 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52913 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52914 (InVT.getSizeInBits() > 128) &&
52915 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52916 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52917
52918 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52919       isPowerOf2_32(VT.getVectorNumElements()) &&
52920 (SVT == MVT::i8 || SVT == MVT::i16) &&
52921 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52922 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52923 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52924 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52925 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52926 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52927 DAG, Subtarget);
52928 assert(Mid && "Failed to pack!");
52929         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52930 Subtarget);
52931 assert(V && "Failed to pack!");
52932 return V;
52933 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52934 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52935 Subtarget);
52936 }
52937 if (SDValue SSatVal = detectSSatPattern(In, VT))
52938 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52939 Subtarget);
52940 }
52941
52942 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52943 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52944 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52945 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52946 unsigned TruncOpc = 0;
52947 SDValue SatVal;
52948 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52949 SatVal = SSatVal;
52950 TruncOpc = X86ISD::VTRUNCS;
52951 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52952 SatVal = USatVal;
52953 TruncOpc = X86ISD::VTRUNCUS;
52954 }
52955 if (SatVal) {
52956 unsigned ResElts = VT.getVectorNumElements();
52957 // If the input type is less than 512 bits and we don't have VLX, we need
52958 // to widen to 512 bits.
52959 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52960 unsigned NumConcats = 512 / InVT.getSizeInBits();
52961 ResElts *= NumConcats;
52962 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52963 ConcatOps[0] = SatVal;
52964 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52965 NumConcats * InVT.getVectorNumElements());
52966 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52967 }
52968 // Widen the result if its narrower than 128 bits.
52969 if (ResElts * SVT.getSizeInBits() < 128)
52970 ResElts = 128 / SVT.getSizeInBits();
52971 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52972 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52973 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52974 DAG.getVectorIdxConstant(0, DL));
52975 }
52976 }
52977
52978 return SDValue();
52979}
52980
52981 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52982 SelectionDAG &DAG,
52983                                         TargetLowering::DAGCombinerInfo &DCI,
52984 const X86Subtarget &Subtarget) {
52985 auto *Ld = cast<LoadSDNode>(N);
52986 EVT RegVT = Ld->getValueType(0);
52987 SDValue Ptr = Ld->getBasePtr();
52988 SDValue Chain = Ld->getChain();
52989 ISD::LoadExtType Ext = Ld->getExtensionType();
52990
52991 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52992 return SDValue();
52993
52994 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52995 return SDValue();
52996
52997   const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
52998 if (!LdC)
52999 return SDValue();
53000
53001 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53002 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53003 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53004 if (Undefs[I])
53005 continue;
53006 if (UserUndefs[I] || Bits[I] != UserBits[I])
53007 return false;
53008 }
53009 return true;
53010 };
53011
53012 // Look through all other loads/broadcasts in the chain for another constant
53013 // pool entry.
53014 for (SDNode *User : Chain->users()) {
53015 auto *UserLd = dyn_cast<MemSDNode>(User);
53016 if (User != N && UserLd &&
53017 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53018 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53019          ISD::isNormalLoad(User)) &&
53020 UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53021 User->getValueSizeInBits(0).getFixedValue() >
53022 RegVT.getFixedSizeInBits()) {
53023 EVT UserVT = User->getValueType(0);
53024 SDValue UserPtr = UserLd->getBasePtr();
53025 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53026
53027 // See if we are loading a constant that matches in the lower
53028 // bits of a longer constant (but from a different constant pool ptr).
53029 if (UserC && UserPtr != Ptr) {
53030 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53031 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53032 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53033 APInt Undefs, UserUndefs;
53034 SmallVector<APInt> Bits, UserBits;
53035 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53036 UserVT.getScalarSizeInBits());
53037 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53038 Bits) &&
53039               getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53040 UserUndefs, UserBits)) {
53041 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53043 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53044 RegVT.getSizeInBits());
53045 Extract = DAG.getBitcast(RegVT, Extract);
53046 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53047 }
53048 }
53049 }
53050 }
53051 }
53052 }
53053
53054 return SDValue();
53055}
53056
53057 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53058                            TargetLowering::DAGCombinerInfo &DCI,
53059 const X86Subtarget &Subtarget) {
53060 auto *Ld = cast<LoadSDNode>(N);
53061 EVT RegVT = Ld->getValueType(0);
53062 EVT MemVT = Ld->getMemoryVT();
53063 SDLoc dl(Ld);
53064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53065
53066 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53067 // into two 16-byte operations. Also split non-temporal aligned loads on
53068 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53069 ISD::LoadExtType Ext = Ld->getExtensionType();
53070 unsigned Fast;
53071 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53072 Ext == ISD::NON_EXTLOAD &&
53073 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53074 Ld->getAlign() >= Align(16)) ||
53075 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53076 *Ld->getMemOperand(), &Fast) &&
53077 !Fast))) {
53078 unsigned NumElems = RegVT.getVectorNumElements();
53079 if (NumElems < 2)
53080 return SDValue();
53081
53082 unsigned HalfOffset = 16;
53083 SDValue Ptr1 = Ld->getBasePtr();
53084 SDValue Ptr2 =
53085 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53086 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53087 NumElems / 2);
53088 SDValue Load1 =
53089 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53090 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53091 SDValue Load2 =
53092 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53093 Ld->getPointerInfo().getWithOffset(HalfOffset),
53094 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53095 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53096 Load1.getValue(1), Load2.getValue(1));
53097
53098 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53099 return DCI.CombineTo(N, NewVec, TF, true);
53100 }
53101
53102 // Bool vector load - attempt to cast to an integer, as we have good
53103 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53104 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53105 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53106 unsigned NumElts = RegVT.getVectorNumElements();
53107 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53108 if (TLI.isTypeLegal(IntVT)) {
53109 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53110 Ld->getPointerInfo(), Ld->getBaseAlign(),
53111 Ld->getMemOperand()->getFlags());
53112 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53113 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53114 }
53115 }
53116
53117 // If we also broadcast this vector to a wider type, then just extract the
53118 // lowest subvector.
53119 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53120 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53121 SDValue Ptr = Ld->getBasePtr();
53122 SDValue Chain = Ld->getChain();
53123 for (SDNode *User : Chain->users()) {
53124 auto *UserLd = dyn_cast<MemSDNode>(User);
53125 if (User != N && UserLd &&
53126 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53127 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53128 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53129 User->hasAnyUseOfValue(0) &&
53130 User->getValueSizeInBits(0).getFixedValue() >
53131 RegVT.getFixedSizeInBits()) {
53133 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53134 RegVT.getSizeInBits());
53135 Extract = DAG.getBitcast(RegVT, Extract);
53136 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53137 }
53138 }
53139 }
53140
53141 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53142 return V;
53143
53144 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53145 unsigned AddrSpace = Ld->getAddressSpace();
53146 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53147 AddrSpace == X86AS::PTR32_UPTR) {
53148 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53149 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53150 SDValue Cast =
53151 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53152 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53153 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53154 Ld->getMemOperand()->getFlags());
53155 }
53156 }
53157
53158 return SDValue();
53159}
53160
53161/// If V is a build vector of boolean constants and exactly one of those
53162/// constants is true, return the operand index of that true element.
53163/// Otherwise, return -1.
53164static int getOneTrueElt(SDValue V) {
53165 // This needs to be a build vector of booleans.
53166 // TODO: Checking for the i1 type matches the IR definition for the mask,
53167 // but the mask check could be loosened to i8 or other types. That might
53168 // also require checking more than 'allOnesValue'; eg, the x86 HW
53169 // instructions only require that the MSB is set for each mask element.
53170 // The ISD::MSTORE comments/definition do not specify how the mask operand
53171 // is formatted.
53172 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53173 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53174 return -1;
53175
53176 int TrueIndex = -1;
53177 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53178 for (unsigned i = 0; i < NumElts; ++i) {
53179 const SDValue &Op = BV->getOperand(i);
53180 if (Op.isUndef())
53181 continue;
53182 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53183 if (!ConstNode)
53184 return -1;
53185 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53186 // If we already found a one, this is too many.
53187 if (TrueIndex >= 0)
53188 return -1;
53189 TrueIndex = i;
53190 }
53191 }
53192 return TrueIndex;
53193}
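// Example (illustrative): for a v4i1 mask build_vector <0, 0, 1, 0> this
// returns 2, while <0, 1, 1, 0> (two set bits) and a non-constant mask both
// return -1.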
53194
53195/// Given a masked memory load/store operation, return true if it has one mask
53196/// bit set. If it has one mask bit set, then also return the memory address of
53197/// the scalar element to load/store, the vector index to insert/extract that
53198/// scalar element, and the alignment for the scalar memory access.
53199 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53200 SelectionDAG &DAG, SDValue &Addr,
53201 SDValue &Index, Align &Alignment,
53202 unsigned &Offset) {
53203 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53204 if (TrueMaskElt < 0)
53205 return false;
53206
53207 // Get the address of the one scalar element that is specified by the mask
53208 // using the appropriate offset from the base pointer.
53209 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53210 Offset = 0;
53211 Addr = MaskedOp->getBasePtr();
53212 if (TrueMaskElt != 0) {
53213 Offset = TrueMaskElt * EltVT.getStoreSize();
53214     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53215 SDLoc(MaskedOp));
53216 }
53217
53218 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53219 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53220 return true;
53221}
53222
53223/// If exactly one element of the mask is set for a non-extending masked load,
53224/// it is a scalar load and vector insert.
53225/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53226/// mask have already been optimized in IR, so we don't bother with those here.
53227static SDValue
53228 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53229                              TargetLowering::DAGCombinerInfo &DCI,
53230 const X86Subtarget &Subtarget) {
53231 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53232 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53233 // However, some target hooks may need to be added to know when the transform
53234 // is profitable. Endianness would also have to be considered.
53235
53236 SDValue Addr, VecIndex;
53237 Align Alignment;
53238 unsigned Offset;
53239 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53240 return SDValue();
53241
53242 // Load the one scalar element that is specified by the mask using the
53243 // appropriate offset from the base pointer.
53244 SDLoc DL(ML);
53245 EVT VT = ML->getValueType(0);
53246 EVT EltVT = VT.getVectorElementType();
53247
53248 EVT CastVT = VT;
53249 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53250 EltVT = MVT::f64;
53251 CastVT = VT.changeVectorElementType(EltVT);
53252 }
53253
53254 SDValue Load =
53255 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53256 ML->getPointerInfo().getWithOffset(Offset),
53257 Alignment, ML->getMemOperand()->getFlags());
53258
53259 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53260
53261 // Insert the loaded element into the appropriate place in the vector.
53262 SDValue Insert =
53263 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53264 Insert = DAG.getBitcast(VT, Insert);
53265 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53266}
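// Example (illustrative, assuming a v4i32 masked load with mask <0, 0, 1, 0>
// and pass-through P): the transform above emits a scalar i32 load from
// BasePtr + 8 and an INSERT_VECTOR_ELT of that value into P at index 2,
// avoiding the masked memory operation entirely.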
53267
53268static SDValue
53269 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53270                               TargetLowering::DAGCombinerInfo &DCI) {
53271 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53272 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53273 return SDValue();
53274
53275 SDLoc DL(ML);
53276 EVT VT = ML->getValueType(0);
53277
53278 // If we are loading the first and last elements of a vector, it is safe and
53279 // always faster to load the whole vector. Replace the masked load with a
53280 // vector load and select.
53281 unsigned NumElts = VT.getVectorNumElements();
53282 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53283 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53284 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53285 if (LoadFirstElt && LoadLastElt) {
53286 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53287 ML->getMemOperand());
53288 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53289 ML->getPassThru());
53290 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53291 }
53292
53293 // Convert a masked load with a constant mask into a masked load and a select.
53294 // This allows the select operation to use a faster kind of select instruction
53295 // (for example, vblendvps -> vblendps).
53296
53297 // Don't try this if the pass-through operand is already undefined. That would
53298 // cause an infinite loop because that's what we're about to create.
53299 if (ML->getPassThru().isUndef())
53300 return SDValue();
53301
53302 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53303 return SDValue();
53304
53305 // The new masked load has an undef pass-through operand. The select uses the
53306 // original pass-through operand.
53307 SDValue NewML = DAG.getMaskedLoad(
53308 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53309 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53310 ML->getAddressingMode(), ML->getExtensionType());
53311 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53312 ML->getPassThru());
53313
53314 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53315}
53316
53317 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53318                                  TargetLowering::DAGCombinerInfo &DCI,
53319 const X86Subtarget &Subtarget) {
53320 auto *Mld = cast<MaskedLoadSDNode>(N);
53321
53322 // TODO: Expanding load with constant mask may be optimized as well.
53323 if (Mld->isExpandingLoad())
53324 return SDValue();
53325
53326 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53327 if (SDValue ScalarLoad =
53328 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53329 return ScalarLoad;
53330
53331 // TODO: Do some AVX512 subsets benefit from this transform?
53332 if (!Subtarget.hasAVX512())
53333 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53334 return Blend;
53335 }
53336
53337 // If the mask value has been legalized to a non-boolean vector, try to
53338 // simplify ops leading up to it. We only demand the MSB of each lane.
53339 SDValue Mask = Mld->getMask();
53340 if (Mask.getScalarValueSizeInBits() != 1) {
53341 EVT VT = Mld->getValueType(0);
53342 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53343     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53344 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53345 if (N->getOpcode() != ISD::DELETED_NODE)
53346 DCI.AddToWorklist(N);
53347 return SDValue(N, 0);
53348 }
53349 if (SDValue NewMask =
53350             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53351 return DAG.getMaskedLoad(
53352 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53353 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53354 Mld->getAddressingMode(), Mld->getExtensionType());
53355 }
53356
53357 return SDValue();
53358}
53359
53360/// If exactly one element of the mask is set for a non-truncating masked store,
53361/// it is a vector extract and scalar store.
53362/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53363/// mask have already been optimized in IR, so we don't bother with those here.
53364 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53365 SelectionDAG &DAG,
53366 const X86Subtarget &Subtarget) {
53367 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53368 // However, some target hooks may need to be added to know when the transform
53369 // is profitable. Endianness would also have to be considered.
53370
53371 SDValue Addr, VecIndex;
53372 Align Alignment;
53373 unsigned Offset;
53374 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53375 return SDValue();
53376
53377 // Extract the one scalar element that is actually being stored.
53378 SDLoc DL(MS);
53379 SDValue Value = MS->getValue();
53380 EVT VT = Value.getValueType();
53381 EVT EltVT = VT.getVectorElementType();
53382 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53383 EltVT = MVT::f64;
53384 EVT CastVT = VT.changeVectorElementType(EltVT);
53385 Value = DAG.getBitcast(CastVT, Value);
53386 }
53387 SDValue Extract =
53388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53389
53390 // Store that element at the appropriate offset from the base pointer.
53391 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53392                        MS->getPointerInfo().getWithOffset(Offset),
53393 Alignment, MS->getMemOperand()->getFlags());
53394}
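// Example (illustrative, assuming a v4f32 masked store of V with mask
// <0, 1, 0, 0>): the transform above extracts element 1 of V and emits a plain
// scalar f32 store to BasePtr + 4.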
53395
53396 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53397                                   TargetLowering::DAGCombinerInfo &DCI,
53398 const X86Subtarget &Subtarget) {
53399   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53400 if (Mst->isCompressingStore())
53401 return SDValue();
53402
53403 EVT VT = Mst->getValue().getValueType();
53404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53405
53406 if (Mst->isTruncatingStore())
53407 return SDValue();
53408
53409 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53410 return ScalarStore;
53411
53412 // If the mask value has been legalized to a non-boolean vector, try to
53413 // simplify ops leading up to it. We only demand the MSB of each lane.
53414 SDValue Mask = Mst->getMask();
53415 if (Mask.getScalarValueSizeInBits() != 1) {
53416     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53417 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53418 if (N->getOpcode() != ISD::DELETED_NODE)
53419 DCI.AddToWorklist(N);
53420 return SDValue(N, 0);
53421 }
53422 if (SDValue NewMask =
53423             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53424 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53425 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53426 Mst->getMemoryVT(), Mst->getMemOperand(),
53427 Mst->getAddressingMode());
53428 }
53429
53430 SDValue Value = Mst->getValue();
53431 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53432 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53433 Mst->getMemoryVT())) {
53434 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53435 Mst->getBasePtr(), Mst->getOffset(), Mask,
53436 Mst->getMemoryVT(), Mst->getMemOperand(),
53437 Mst->getAddressingMode(), true);
53438 }
53439
53440 return SDValue();
53441}
53442
53443 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53444                             TargetLowering::DAGCombinerInfo &DCI,
53445 const X86Subtarget &Subtarget) {
53446   StoreSDNode *St = cast<StoreSDNode>(N);
53447 EVT StVT = St->getMemoryVT();
53448 SDLoc dl(St);
53449 SDValue StoredVal = St->getValue();
53450 EVT VT = StoredVal.getValueType();
53451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53452
53453 // Convert a store of vXi1 into a store of iX and a bitcast.
53454 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53455 VT.getVectorElementType() == MVT::i1) {
53456
53457     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53458 StoredVal = DAG.getBitcast(NewVT, StoredVal);
53459
53460 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53461 St->getPointerInfo(), St->getBaseAlign(),
53462 St->getMemOperand()->getFlags());
53463 }
53464
53465 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53466 // This will avoid a copy to k-register.
53467 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53468 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53469 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53470 SDValue Val = StoredVal.getOperand(0);
53471 // We must store zeros to the unused bits.
53472 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53473 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53474 St->getPointerInfo(), St->getBaseAlign(),
53475 St->getMemOperand()->getFlags());
53476 }
53477
53478 // Widen v2i1/v4i1 stores to v8i1.
53479 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53480 Subtarget.hasAVX512()) {
53481 unsigned NumConcats = 8 / VT.getVectorNumElements();
53482 // We must store zeros to the unused bits.
53483 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53484 Ops[0] = StoredVal;
53485 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53486 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53487 St->getPointerInfo(), St->getBaseAlign(),
53488 St->getMemOperand()->getFlags());
53489 }
53490
53491 // Turn vXi1 stores of constants into a scalar store.
53492 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53493 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53494       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53495     // If it's a v64i1 store without 64-bit support, we need two stores.
53496 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53497 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53498 StoredVal->ops().slice(0, 32));
53499       Lo = combinevXi1ConstantToInteger(Lo, DAG);
53500 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53501 StoredVal->ops().slice(32, 32));
53502       Hi = combinevXi1ConstantToInteger(Hi, DAG);
53503
53504 SDValue Ptr0 = St->getBasePtr();
53505 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53506
53507 SDValue Ch0 =
53508 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53509 St->getBaseAlign(), St->getMemOperand()->getFlags());
53510 SDValue Ch1 = DAG.getStore(
53511 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53512 St->getBaseAlign(), St->getMemOperand()->getFlags());
53513 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53514 }
53515
53516 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53517 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53518 St->getPointerInfo(), St->getBaseAlign(),
53519 St->getMemOperand()->getFlags());
53520 }
53521
53522 // Convert scalar fabs/fneg load-store to integer equivalents.
53523 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53524 (StoredVal.getOpcode() == ISD::FABS ||
53525 StoredVal.getOpcode() == ISD::FNEG) &&
53526 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53527 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53528 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53529 if (TLI.isTypeLegal(IntVT)) {
53530       APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53531 unsigned SignOp = ISD::XOR;
53532 if (StoredVal.getOpcode() == ISD::FABS) {
53533 SignMask = ~SignMask;
53534 SignOp = ISD::AND;
53535 }
53536 SDValue LogicOp = DAG.getNode(
53537 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53538 DAG.getConstant(SignMask, dl, IntVT));
53539 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53540 St->getPointerInfo(), St->getBaseAlign(),
53541 St->getMemOperand()->getFlags());
53542 }
53543 }
53544
53545 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53546 // Sandy Bridge, perform two 16-byte stores.
53547 unsigned Fast;
53548 if (VT.is256BitVector() && StVT == VT &&
53549 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53550 *St->getMemOperand(), &Fast) &&
53551 !Fast) {
53552 unsigned NumElems = VT.getVectorNumElements();
53553 if (NumElems < 2)
53554 return SDValue();
53555
53556 return splitVectorStore(St, DAG);
53557 }
53558
53559 // Split under-aligned vector non-temporal stores.
53560 if (St->isNonTemporal() && StVT == VT &&
53561 St->getAlign().value() < VT.getStoreSize()) {
53562 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53563 // vectors or the legalizer can scalarize it to use MOVNTI.
53564 if (VT.is256BitVector() || VT.is512BitVector()) {
53565 unsigned NumElems = VT.getVectorNumElements();
53566 if (NumElems < 2)
53567 return SDValue();
53568 return splitVectorStore(St, DAG);
53569 }
53570
53571 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53572 // to use MOVNTI.
53573 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53574 MVT NTVT = Subtarget.hasSSE4A()
53575 ? MVT::v2f64
53576 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53577 return scalarizeVectorStore(St, NTVT, DAG);
53578 }
53579 }
53580
53581   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53582   // supported but AVX512F is, by extending to v16i32 and truncating.
53583 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53584 St->getValue().getOpcode() == ISD::TRUNCATE &&
53585 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53586 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53587 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53588 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53589 St->getValue().getOperand(0));
53590 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53591 MVT::v16i8, St->getMemOperand());
53592 }
53593
53594 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53595 if (!St->isTruncatingStore() &&
53596 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53597 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53598 StoredVal.hasOneUse() &&
53599 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53600 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53601 return EmitTruncSStore(IsSigned, St->getChain(),
53602 dl, StoredVal.getOperand(0), St->getBasePtr(),
53603 VT, St->getMemOperand(), DAG);
53604 }
53605
53606   // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53607 if (!St->isTruncatingStore()) {
53608 auto IsExtractedElement = [](SDValue V) {
53609 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53610 V = V.getOperand(0);
53611 unsigned Opc = V.getOpcode();
53612       if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53613 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53614 V.getOperand(0).hasOneUse())
53615 return V.getOperand(0);
53616 return SDValue();
53617 };
53618 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53619 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53620 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53621 SDValue Src = Trunc.getOperand(0);
53622 MVT DstVT = Trunc.getSimpleValueType();
53623 MVT SrcVT = Src.getSimpleValueType();
53624 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53625 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53626 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53627 if (NumTruncBits == VT.getSizeInBits() &&
53628 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53629 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53630 TruncVT, St->getMemOperand());
53631 }
53632 }
53633 }
53634 }
53635
53636 // Optimize trunc store (of multiple scalars) to shuffle and store.
53637 // First, pack all of the elements in one place. Next, store to memory
53638 // in fewer chunks.
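// For example, a v8i32->v8i16 truncating store whose value already matches a
// signed/unsigned saturation pattern can be emitted as a single saturating
// truncating store (see detectSSatPattern/detectUSatPattern below) instead of
// a pack/shuffle sequence followed by a store.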
53639 if (St->isTruncatingStore() && VT.isVector()) {
53640 if (TLI.isTruncStoreLegal(VT, StVT)) {
53641 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53642 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53643 dl, Val, St->getBasePtr(),
53644 St->getMemoryVT(), St->getMemOperand(), DAG);
53645 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53646 DAG, dl))
53647 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53648 dl, Val, St->getBasePtr(),
53649 St->getMemoryVT(), St->getMemOperand(), DAG);
53650 }
53651
53652 return SDValue();
53653 }
53654
53655 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53656 unsigned AddrSpace = St->getAddressSpace();
53657 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53658 AddrSpace == X86AS::PTR32_UPTR) {
53659 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53660 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53661 SDValue Cast =
53662 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53663 return DAG.getTruncStore(
53664 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53665 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53666 }
53667 }
53668
53669 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53670 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
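// The load operand of the CMOV models the "keep the old value" case: when the
// condition selects the loaded value the store is a no-op, so with CF
// (conditional faulting) support the sequence becomes a single conditional
// store of x.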
53671 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53672 Subtarget.hasCF() && St->isSimple()) {
53673 SDValue Cmov;
53674 if (StoredVal.getOpcode() == X86ISD::CMOV)
53675 Cmov = StoredVal;
53676 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53677 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53678 Cmov = StoredVal.getOperand(0);
53679 else
53680 return SDValue();
53681
53682 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53683 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53684 return SDValue();
53685
53686 bool InvertCC = false;
53687 SDValue V = SDValue(Ld, 0);
53688 if (V == Cmov.getOperand(1))
53689 InvertCC = true;
53690 else if (V != Cmov.getOperand(0))
53691 return SDValue();
53692
53693 SDVTList Tys = DAG.getVTList(MVT::Other);
53694 SDValue CC = Cmov.getOperand(2);
53695 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53696 if (InvertCC)
53697 CC = DAG.getTargetConstant(
53698 GetOppositeBranchCondition(
53699 (X86::CondCode)Cmov.getConstantOperandVal(2)),
53700 dl, MVT::i8);
53701 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53702 Cmov.getOperand(3)};
53703 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53704 St->getMemOperand());
53705 }
53706
53707 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53708 // the FP state in cases where an emms may be missing.
53709 // A preferable solution to the general problem is to figure out the right
53710 // places to insert EMMS. This qualifies as a quick hack.
53711
53712 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
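// i.e. on a 32-bit target with SSE2, an i64 value copied through memory
// becomes a single f64 (MOVQ) load/store pair instead of two 32-bit GPR
// transfers.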
53713 if (VT.getSizeInBits() != 64)
53714 return SDValue();
53715
53716 const Function &F = DAG.getMachineFunction().getFunction();
53717 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53718 bool F64IsLegal =
53719 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53720
53721 if (!F64IsLegal || Subtarget.is64Bit())
53722 return SDValue();
53723
53724 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53725 cast<LoadSDNode>(St->getValue())->isSimple() &&
53726 St->getChain().hasOneUse() && St->isSimple()) {
53727 auto *Ld = cast<LoadSDNode>(St->getValue());
53728
53729 if (!ISD::isNormalLoad(Ld))
53730 return SDValue();
53731
53732 // Avoid the transformation if there are multiple uses of the loaded value.
53733 if (!Ld->hasNUsesOfValue(1, 0))
53734 return SDValue();
53735
53736 SDLoc LdDL(Ld);
53737 SDLoc StDL(N);
53738
53739 // Remove any range metadata as we're converting to f64 load/store.
53740 Ld->getMemOperand()->clearRanges();
53741
53742 // Lower to a single movq load/store pair.
53743 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53744 Ld->getBasePtr(), Ld->getMemOperand());
53745
53746 // Make sure new load is placed in same chain order.
53747 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53748 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53749 St->getMemOperand());
53750 }
53751
53752 // This is similar to the above case, but here we handle a scalar 64-bit
53753 // integer store that is extracted from a vector on a 32-bit target.
53754 // If we have SSE2, then we can treat it like a floating-point double
53755 // to get past legalization. The execution dependencies fixup pass will
53756 // choose the optimal machine instruction for the store if this really is
53757 // an integer or v2f32 rather than an f64.
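// e.g. (store (i64 (extractelt (v2i64 X), 0))) becomes
// (store (f64 (extractelt (v2f64 (bitcast X)), 0))).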
53758 if (VT == MVT::i64 &&
53759 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53760 SDValue OldExtract = St->getOperand(1);
53761 SDValue ExtOp0 = OldExtract.getOperand(0);
53762 unsigned VecSize = ExtOp0.getValueSizeInBits();
53763 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53764 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53765 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53766 BitCast, OldExtract.getOperand(1));
53767 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53768 St->getPointerInfo(), St->getBaseAlign(),
53769 St->getMemOperand()->getFlags());
53770 }
53771
53772 return SDValue();
53773}
53774
53775static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53776 TargetLowering::DAGCombinerInfo &DCI,
53777 const X86Subtarget &Subtarget) {
53778 auto *St = cast<MemIntrinsicSDNode>(N);
53779
53780 SDValue StoredVal = N->getOperand(1);
53781 MVT VT = StoredVal.getSimpleValueType();
53782 EVT MemVT = St->getMemoryVT();
53783
53784 // Figure out which elements we demand.
53785 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53786 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53787
53788 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53789 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53790 if (N->getOpcode() != ISD::DELETED_NODE)
53791 DCI.AddToWorklist(N);
53792 return SDValue(N, 0);
53793 }
53794
53795 return SDValue();
53796}
53797
53798/// Return 'true' if this vector operation is "horizontal"
53799/// and return the operands for the horizontal operation in LHS and RHS. A
53800/// horizontal operation performs the binary operation on successive elements
53801/// of its first operand, then on successive elements of its second operand,
53802/// returning the resulting values in a vector. For example, if
53803/// A = < float a0, float a1, float a2, float a3 >
53804/// and
53805/// B = < float b0, float b1, float b2, float b3 >
53806/// then the result of doing a horizontal operation on A and B is
53807/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53808/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53809/// A horizontal-op B, for some already available A and B, and if so then LHS is
53810/// set to A, RHS to B, and the routine returns 'true'.
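/// PostShuffleMask is populated with a mask to apply to the horizontal op's
/// result when the matched elements are not already in horizontal order;
/// callers emit that shuffle after building the node.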
53811static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53812 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53813 bool IsCommutative,
53814 SmallVectorImpl<int> &PostShuffleMask,
53815 bool ForceHorizOp) {
53816 // If either operand is undef, bail out. The binop should be simplified.
53817 if (LHS.isUndef() || RHS.isUndef())
53818 return false;
53819
53820 // Look for the following pattern:
53821 // A = < float a0, float a1, float a2, float a3 >
53822 // B = < float b0, float b1, float b2, float b3 >
53823 // and
53824 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53825 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53826 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53827 // which is A horizontal-op B.
53828
53829 MVT VT = LHS.getSimpleValueType();
53830 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53831 "Unsupported vector type for horizontal add/sub");
53832 unsigned NumElts = VT.getVectorNumElements();
53833
53834 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53835 SmallVectorImpl<int> &ShuffleMask) {
53836 bool UseSubVector = false;
53837 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53838 Op.getOperand(0).getValueType().is256BitVector() &&
53839 llvm::isNullConstant(Op.getOperand(1))) {
53840 Op = Op.getOperand(0);
53841 UseSubVector = true;
53842 }
53843 SmallVector<SDValue, 2> SrcOps;
53844 SmallVector<int, 16> SrcMask, ScaledMask;
53845 SDValue BC = peekThroughBitcasts(Op);
53846 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53847 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53848 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53849 })) {
53850 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53851 if (!UseSubVector && SrcOps.size() <= 2 &&
53852 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53853 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53854 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53855 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53856 }
53857 if (UseSubVector && SrcOps.size() == 1 &&
53858 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53859 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53860 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53861 ShuffleMask.assign(Mask.begin(), Mask.end());
53862 }
53863 }
53864 };
53865
53866 // View LHS in the form
53867 // LHS = VECTOR_SHUFFLE A, B, LMask
53868 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53869 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53870 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53871 SDValue A, B;
53872 SmallVector<int, 16> LMask;
53873 GetShuffle(LHS, A, B, LMask);
53874
53875 // Likewise, view RHS in the form
53876 // RHS = VECTOR_SHUFFLE C, D, RMask
53877 SDValue C, D;
53878 SmallVector<int, 16> RMask;
53879 GetShuffle(RHS, C, D, RMask);
53880
53881 // At least one of the operands should be a vector shuffle.
53882 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53883 if (NumShuffles == 0)
53884 return false;
53885
53886 if (LMask.empty()) {
53887 A = LHS;
53888 for (unsigned i = 0; i != NumElts; ++i)
53889 LMask.push_back(i);
53890 }
53891
53892 if (RMask.empty()) {
53893 C = RHS;
53894 for (unsigned i = 0; i != NumElts; ++i)
53895 RMask.push_back(i);
53896 }
53897
53898 // If we have a unary mask, ensure the other op is set to null.
53899 if (isUndefOrInRange(LMask, 0, NumElts))
53900 B = SDValue();
53901 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53902 A = SDValue();
53903
53904 if (isUndefOrInRange(RMask, 0, NumElts))
53905 D = SDValue();
53906 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53907 C = SDValue();
53908
53909 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53910 // RHS operands and shuffle mask.
53911 if (A != C) {
53912 std::swap(C, D);
53913 ShuffleVectorSDNode::commuteShuffleMask(RMask, NumElts);
53914 }
53915 // Check that the shuffles are both shuffling the same vectors.
53916 if (!(A == C && B == D))
53917 return false;
53918
53919 PostShuffleMask.clear();
53920 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53921
53922 // LHS and RHS are now:
53923 // LHS = shuffle A, B, LMask
53924 // RHS = shuffle A, B, RMask
53925 // Check that the masks correspond to performing a horizontal operation.
53926 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53927 // so we just repeat the inner loop if this is a 256-bit op.
53928 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53929 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53930 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53931 assert((NumEltsPer128BitChunk % 2 == 0) &&
53932 "Vector type should have an even number of elements in each lane");
53933 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53934 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53935 // Ignore undefined components.
53936 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53937 if (LIdx < 0 || RIdx < 0 ||
53938 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53939 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53940 continue;
53941
53942 // Check that successive odd/even elements are being operated on. If not,
53943 // this is not a horizontal operation.
53944 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53945 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53946 return false;
53947
53948 // Compute the post-shuffle mask index based on where the element
53949 // is stored in the HOP result, and where it needs to be moved to.
53950 int Base = LIdx & ~1u;
53951 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53952 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53953
53954 // The low half of the 128-bit result must choose from A.
53955 // The high half of the 128-bit result must choose from B,
53956 // unless B is undef. In that case, we are always choosing from A.
53957 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53958 Index += NumEltsPer64BitChunk;
53959 PostShuffleMask[i + j] = Index;
53960 }
53961 }
53962
53963 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53964 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53965
53966 bool IsIdentityPostShuffle =
53967 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53968 if (IsIdentityPostShuffle)
53969 PostShuffleMask.clear();
53970
53971 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53972 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53973 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53974 return false;
53975
53976 // If the source nodes are already used in HorizOps then always accept this.
53977 // Shuffle folding should merge these back together.
53978 auto FoundHorizUser = [&](SDNode *User) {
53979 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53980 };
53981 ForceHorizOp =
53982 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53983 llvm::any_of(NewRHS->users(), FoundHorizUser));
53984
53985 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53986 // shuffle the result.
53987 if (!ForceHorizOp &&
53988 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53989 (NumShuffles < 2 || !IsIdentityPostShuffle),
53990 DAG, Subtarget))
53991 return false;
53992
53993 LHS = DAG.getBitcast(VT, NewLHS);
53994 RHS = DAG.getBitcast(VT, NewRHS);
53995 return true;
53996}
53997
53998// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
53999static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54000 const X86Subtarget &Subtarget) {
54001 EVT VT = N->getValueType(0);
54002 unsigned Opcode = N->getOpcode();
54003 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54004 SmallVector<int, 8> PostShuffleMask;
54005
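// Prefer forming the horizontal op when our only user is a shuffle whose
// other operand is already a horizontal op of the same kind; shuffle
// combining can then merge the two.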
54006 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54007 return N->hasOneUse() &&
54008 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54009 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54010 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54011 };
54012
54013 switch (Opcode) {
54014 case ISD::FADD:
54015 case ISD::FSUB:
54016 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54017 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54018 SDValue LHS = N->getOperand(0);
54019 SDValue RHS = N->getOperand(1);
54020 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54021 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54022 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54023 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54024 if (!PostShuffleMask.empty())
54025 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54026 DAG.getUNDEF(VT), PostShuffleMask);
54027 return HorizBinOp;
54028 }
54029 }
54030 break;
54031 case ISD::ADD:
54032 case ISD::SUB:
54033 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54034 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54035 SDValue LHS = N->getOperand(0);
54036 SDValue RHS = N->getOperand(1);
54037 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54038 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54039 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54040 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54041 ArrayRef<SDValue> Ops) {
54042 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54043 };
54044 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54045 {LHS, RHS}, HOpBuilder);
54046 if (!PostShuffleMask.empty())
54047 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54048 DAG.getUNDEF(VT), PostShuffleMask);
54049 return HorizBinOp;
54050 }
54051 }
54052 break;
54053 }
54054
54055 return SDValue();
54056}
54057
54058// Try to combine the following nodes
54059// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54060// <i32 -2147483648[float -0.000000e+00]> 0
54061// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54062// <(load 4 from constant-pool)> t0, t29
54063// [t30: v16i32 = bitcast t27]
54064// t6: v16i32 = xor t7, t27[t30]
54065// t11: v16f32 = bitcast t6
54066// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54067// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54068// t22: v16f32 = bitcast t7
54069// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54070// t24: v32f16 = bitcast t23
54071static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54072 const X86Subtarget &Subtarget) {
54073 EVT VT = N->getValueType(0);
54074 SDValue LHS = N->getOperand(0);
54075 SDValue RHS = N->getOperand(1);
54076 int CombineOpcode =
54077 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54078 auto combineConjugation = [&](SDValue &r) {
54079 if (LHS->getOpcode() == ISD::BITCAST) {
54080 SDValue XOR = LHS.getOperand(0);
54081 if (XOR->getOpcode() == ISD::XOR) {
54082 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54083 if (XORRHS.isConstant()) {
54084 APInt ConjugationInt32 = APInt(32, 0x80000000);
54085 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54086 if ((XORRHS.getBitWidth() == 32 &&
54087 XORRHS.getConstant() == ConjugationInt32) ||
54088 (XORRHS.getBitWidth() == 64 &&
54089 XORRHS.getConstant() == ConjugationInt64)) {
54090 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54091 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54092 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54093 r = DAG.getBitcast(VT, FCMulC);
54094 return true;
54095 }
54096 }
54097 }
54098 }
54099 return false;
54100 };
54101 SDValue Res;
54102 if (combineConjugation(Res))
54103 return Res;
54104 std::swap(LHS, RHS);
54105 if (combineConjugation(Res))
54106 return Res;
54107 return Res;
54108}
54109
54110// Try to combine the following nodes:
54111// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
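// The multiplies here are complex FP16 ops (X86ISD::VF(C)MULC/VF(C)MADDC),
// so the FADD addend is bitcast to the matching vNf32 complex type before
// building the new VF(C)MADDC node.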
54112static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54113 const X86Subtarget &Subtarget) {
54114 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54115 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54116 Flags.hasAllowContract();
54117 };
54118
54119 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54120 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54121 Flags.hasNoSignedZeros();
54122 };
54123 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54124 APInt AI = APInt(32, 0x80008000);
54125 KnownBits Bits = DAG.computeKnownBits(Op);
54126 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54127 Bits.getConstant() == AI;
54128 };
54129
54130 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54131 !AllowContract(N->getFlags()))
54132 return SDValue();
54133
54134 EVT VT = N->getValueType(0);
54135 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54136 return SDValue();
54137
54138 SDValue LHS = N->getOperand(0);
54139 SDValue RHS = N->getOperand(1);
54140 bool IsConj;
54141 SDValue FAddOp1, MulOp0, MulOp1;
54142 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54143 &IsVectorAllNegativeZero,
54144 &HasNoSignedZero](SDValue N) -> bool {
54145 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54146 return false;
54147 SDValue Op0 = N.getOperand(0);
54148 unsigned Opcode = Op0.getOpcode();
54149 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54150 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54151 MulOp0 = Op0.getOperand(0);
54152 MulOp1 = Op0.getOperand(1);
54153 IsConj = Opcode == X86ISD::VFCMULC;
54154 return true;
54155 }
54156 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54157 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54158 HasNoSignedZero(Op0->getFlags())) ||
54159 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54160 MulOp0 = Op0.getOperand(0);
54161 MulOp1 = Op0.getOperand(1);
54162 IsConj = Opcode == X86ISD::VFCMADDC;
54163 return true;
54164 }
54165 }
54166 return false;
54167 };
54168
54169 if (GetCFmulFrom(LHS))
54170 FAddOp1 = RHS;
54171 else if (GetCFmulFrom(RHS))
54172 FAddOp1 = LHS;
54173 else
54174 return SDValue();
54175
54176 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54177 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54178 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54179 // FIXME: How do we handle when fast math flags of FADD are different from
54180 // CFMUL's?
54181 SDValue CFmul =
54182 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54183 return DAG.getBitcast(VT, CFmul);
54184}
54185
54186/// Do target-specific dag combines on floating-point adds/subs.
54187static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54188 const X86Subtarget &Subtarget) {
54189 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54190 return HOp;
54191
54192 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54193 return COp;
54194
54195 return SDValue();
54196}
54197
54198static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54199 const X86Subtarget &Subtarget) {
54200 EVT VT = N->getValueType(0);
54201 SDValue Src = N->getOperand(0);
54202 EVT SrcVT = Src.getValueType();
54203 SDLoc DL(N);
54204
54205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54206
54207 // Let legalize expand this if it isn't a legal type yet.
54208 if (!TLI.isTypeLegal(VT))
54209 return SDValue();
54210
54211 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54212 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54213 return SDValue();
54214
54215 if (SrcVT == MVT::v2f16) {
54216 SrcVT = MVT::v4f16;
54217 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54218 DAG.getUNDEF(MVT::v2f16));
54219 }
54220
54221 if (SrcVT == MVT::v4f16) {
54222 SrcVT = MVT::v8f16;
54223 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54224 DAG.getUNDEF(MVT::v4f16));
54225 } else if (SrcVT == MVT::v2f32) {
54226 SrcVT = MVT::v4f32;
54227 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54228 DAG.getUNDEF(MVT::v2f32));
54229 } else {
54230 return SDValue();
54231 }
54232
54233 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54234}
54235
54236// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54237// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54238// are able to avoid generating code with MOVABS and large constants in certain
54239// cases.
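// e.g. (i32 (trunc (srl (or X:i64, 0xAB0000000000), 40)))
//      --> (or (i32 (trunc (srl X, 40))), 0xAB)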
54240static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54241 const SDLoc &DL) {
54242 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54243 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54244 if (!ValidSrlConst)
54245 return SDValue();
54246 unsigned SrlConstVal = *ValidSrlConst;
54247
54248 SDValue Op = N.getOperand(0);
54249 unsigned Opcode = Op.getOpcode();
54250 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54251 "Illegal truncation types");
54252
54253 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54254 !isa<ConstantSDNode>(Op.getOperand(1)))
54255 return SDValue();
54256 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54257
54258 if (SrlConstVal <= 32 ||
54259 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54260 return SDValue();
54261
54262 SDValue OpLhsSrl =
54263 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54264 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54265
54266 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54267 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54268 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54269
54270 if (Opcode == ISD::ADD) {
54271 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54272 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54273 }
54274 return NewOpNode;
54275}
54276
54277/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54278/// the codegen.
54279/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54280/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54281/// anything that is guaranteed to be transformed by DAGCombiner.
54282static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54283 const X86Subtarget &Subtarget,
54284 const SDLoc &DL) {
54285 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54286 SDValue Src = N->getOperand(0);
54287 unsigned SrcOpcode = Src.getOpcode();
54288 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54289
54290 EVT VT = N->getValueType(0);
54291 EVT SrcVT = Src.getValueType();
54292
54293 auto IsFreeTruncation = [VT](SDValue Op) {
54294 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54295
54296 // See if this has been extended from a smaller/equal size to
54297 // the truncation size, allowing a truncation to combine with the extend.
54298 unsigned Opcode = Op.getOpcode();
54299 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54300 Opcode == ISD::ZERO_EXTEND) &&
54301 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54302 return true;
54303
54304 // See if this is a single use constant which can be constant folded.
54305 // NOTE: We don't peek through bitcasts here because there is currently
54306 // no support for constant folding truncate+bitcast+vector_of_constants. So
54307 // we'll just end up with a truncate on both operands which will
54308 // get turned back into (truncate (binop)) causing an infinite loop.
54309 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54310 };
54311
54312 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54313 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54314 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54315 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54316 };
54317
54318 // Don't combine if the operation has other uses.
54319 if (!Src.hasOneUse())
54320 return SDValue();
54321
54322 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54323 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54324
54325 if (!VT.isVector())
54326 return SDValue();
54327
54328 // In most cases it's only worth pre-truncating if we're only facing the cost
54329 // of one truncation.
54330 // i.e. if one of the inputs will constant fold or the input is repeated.
54331 switch (SrcOpcode) {
54332 case ISD::MUL:
54333 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54334 // better to truncate if we have the chance.
54335 if (SrcVT.getScalarType() == MVT::i64 &&
54336 TLI.isOperationLegal(SrcOpcode, VT) &&
54337 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54338 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54339 [[fallthrough]];
54340 case ISD::AND:
54341 case ISD::XOR:
54342 case ISD::OR:
54343 case ISD::ADD:
54344 case ISD::SUB: {
54345 SDValue Op0 = Src.getOperand(0);
54346 SDValue Op1 = Src.getOperand(1);
54347 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54348 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54349 return TruncateArithmetic(Op0, Op1);
54350 break;
54351 }
54352 }
54353
54354 return SDValue();
54355}
54356
54357// Try to form a MULHU or MULHS node by looking for
54358// (trunc (srl (mul ext, ext), >= 16))
54359// TODO: This is X86 specific because we want to be able to handle wide types
54360// before type legalization. But we can only do it if the vector will be
54361// legalized via widening/splitting. Type legalization can't handle promotion
54362// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54363// combiner.
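// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
//                                (zext v8i16 Y to v8i32)), 16)))
//      --> (v8i16 (mulhu X, Y))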
54364static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54365 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54366 using namespace llvm::SDPatternMatch;
54367
54368 if (!Subtarget.hasSSE2())
54369 return SDValue();
54370
54371 // Only handle vXi16 types that are at least 128-bits unless they will be
54372 // widened.
54373 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54374 return SDValue();
54375
54376 // Input type should be at least vXi32.
54377 EVT InVT = Src.getValueType();
54378 if (InVT.getVectorElementType().getSizeInBits() < 32)
54379 return SDValue();
54380
54381 // First instruction should be a right shift by 16 of a multiply.
54382 SDValue LHS, RHS;
54383 APInt ShiftAmt;
54384 if (!sd_match(Src,
54385 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54386 return SDValue();
54387
54388 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54389 return SDValue();
54390
54391 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54392
54393 // Count leading sign/zero bits on both inputs - if there are enough then
54394 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54395 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54396 // truncations may actually be free by peeking through to the ext source.
54397 auto IsSext = [&DAG](SDValue V) {
54398 return DAG.ComputeMaxSignificantBits(V) <= 16;
54399 };
54400 auto IsZext = [&DAG](SDValue V) {
54401 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54402 };
54403
54404 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54405 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54406 if (!IsSigned && !IsUnsigned)
54407 return SDValue();
54408
54409 // Check if both inputs are extensions, which will be removed by truncation.
54410 auto isOpTruncateFree = [](SDValue Op) {
54411 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54412 Op.getOpcode() == ISD::ZERO_EXTEND)
54413 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54414 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54415 };
54416 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54417
54418 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54419 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54420 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54421 // will have to split anyway.
54422 unsigned InSizeInBits = InVT.getSizeInBits();
54423 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54424 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54425 (InSizeInBits % 16) == 0) {
54426 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54427 InVT.getSizeInBits() / 16);
54428 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54429 DAG.getBitcast(BCVT, RHS));
54430 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54431 return DAG.getNode(ISD::SRL, DL, VT, Res,
54432 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54433 }
54434
54435 // Truncate back to source type.
54436 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54437 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54438
54439 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54440 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54441 return DAG.getNode(ISD::SRL, DL, VT, Res,
54442 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54443}
54444
54445// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54446// from one vector with signed bytes from another vector, adds together
54447// adjacent pairs of 16-bit products, and saturates the result before
54448// truncating to 16-bits.
54449//
54450// Which looks something like this:
54451// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54452// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
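// This corresponds to the (V)PMADDUBSW instruction, emitted below as
// X86ISD::VPMADDUBSW and split into legal 128/256-bit pieces as needed.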
54453static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54454 const X86Subtarget &Subtarget,
54455 const SDLoc &DL) {
54456 if (!VT.isVector() || !Subtarget.hasSSSE3())
54457 return SDValue();
54458
54459 unsigned NumElems = VT.getVectorNumElements();
54460 EVT ScalarVT = VT.getVectorElementType();
54461 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54462 return SDValue();
54463
54464 SDValue SSatVal = detectSSatPattern(In, VT);
54465 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54466 return SDValue();
54467
54468 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54469 // of multiplies from even/odd elements.
54470 SDValue N0 = SSatVal.getOperand(0);
54471 SDValue N1 = SSatVal.getOperand(1);
54472
54473 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54474 return SDValue();
54475
54476 SDValue N00 = N0.getOperand(0);
54477 SDValue N01 = N0.getOperand(1);
54478 SDValue N10 = N1.getOperand(0);
54479 SDValue N11 = N1.getOperand(1);
54480
54481 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54482 // Canonicalize zero_extend to LHS.
54483 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54484 std::swap(N00, N01);
54485 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54486 std::swap(N10, N11);
54487
54488 // Ensure we have a zero_extend and a sign_extend.
54489 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54490 N01.getOpcode() != ISD::SIGN_EXTEND ||
54491 N10.getOpcode() != ISD::ZERO_EXTEND ||
54492 N11.getOpcode() != ISD::SIGN_EXTEND)
54493 return SDValue();
54494
54495 // Peek through the extends.
54496 N00 = N00.getOperand(0);
54497 N01 = N01.getOperand(0);
54498 N10 = N10.getOperand(0);
54499 N11 = N11.getOperand(0);
54500
54501 // Ensure the extend is from vXi8.
54502 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54503 N01.getValueType().getVectorElementType() != MVT::i8 ||
54504 N10.getValueType().getVectorElementType() != MVT::i8 ||
54505 N11.getValueType().getVectorElementType() != MVT::i8)
54506 return SDValue();
54507
54508 // All inputs should be build_vectors.
54509 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54510 N01.getOpcode() != ISD::BUILD_VECTOR ||
54511 N10.getOpcode() != ISD::BUILD_VECTOR ||
54512 N11.getOpcode() != ISD::BUILD_VECTOR)
54513 return SDValue();
54514
54515 // N00/N10 are zero extended. N01/N11 are sign extended.
54516
54517 // For each element, we need to ensure we have an odd element from one vector
54518 // multiplied by the odd element of another vector and the even element from
54519 // one of the same vectors being multiplied by the even element from the
54520 // other vector. So we need to make sure for each element i, this operator
54521 // is being performed:
54522 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54523 SDValue ZExtIn, SExtIn;
54524 for (unsigned i = 0; i != NumElems; ++i) {
54525 SDValue N00Elt = N00.getOperand(i);
54526 SDValue N01Elt = N01.getOperand(i);
54527 SDValue N10Elt = N10.getOperand(i);
54528 SDValue N11Elt = N11.getOperand(i);
54529 // TODO: Be more tolerant to undefs.
54530 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54531 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54532 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54533 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54534 return SDValue();
54535 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54536 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54537 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54538 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54539 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54540 return SDValue();
54541 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54542 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54543 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54544 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54545 // Add is commutative so indices can be reordered.
54546 if (IdxN00 > IdxN10) {
54547 std::swap(IdxN00, IdxN10);
54548 std::swap(IdxN01, IdxN11);
54549 }
54550 // N0 indices must be the even element. N1 indices must be the next odd element.
54551 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54552 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54553 return SDValue();
54554 SDValue N00In = N00Elt.getOperand(0);
54555 SDValue N01In = N01Elt.getOperand(0);
54556 SDValue N10In = N10Elt.getOperand(0);
54557 SDValue N11In = N11Elt.getOperand(0);
54558 // The first time we find an input, capture it.
54559 if (!ZExtIn) {
54560 ZExtIn = N00In;
54561 SExtIn = N01In;
54562 }
54563 if (ZExtIn != N00In || SExtIn != N01In ||
54564 ZExtIn != N10In || SExtIn != N11In)
54565 return SDValue();
54566 }
54567
54568 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54569 EVT ExtVT = Ext.getValueType();
54570 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54571 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54572 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54573 DAG.getVectorIdxConstant(0, DL));
54574 }
54575 };
54576 ExtractVec(ZExtIn);
54577 ExtractVec(SExtIn);
54578
54579 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54580 ArrayRef<SDValue> Ops) {
54581 // Shrink by adding truncate nodes and let DAGCombine fold with the
54582 // sources.
54583 EVT InVT = Ops[0].getValueType();
54584 assert(InVT.getScalarType() == MVT::i8 &&
54585 "Unexpected scalar element type");
54586 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54587 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54588 InVT.getVectorNumElements() / 2);
54589 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54590 };
54591 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54592 PMADDBuilder);
54593}
54594
54595static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54596 const X86Subtarget &Subtarget) {
54597 EVT VT = N->getValueType(0);
54598 SDValue Src = N->getOperand(0);
54599 SDLoc DL(N);
54600
54601 // Attempt to pre-truncate inputs to arithmetic ops instead.
54602 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54603 return V;
54604
54605 // Try to detect PMADD
54606 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54607 return PMAdd;
54608
54609 // Try to combine truncation with signed/unsigned saturation.
54610 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54611 return Val;
54612
54613 // Try to combine PMULHUW/PMULHW for vXi16.
54614 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54615 return V;
54616
54617 // Detect (i32 (truncate (bitcast x86mmx))) where the bitcast source is a
54618 // direct MMX result, and lower it to MMX_MOVD2W.
54619 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54620 SDValue BCSrc = Src.getOperand(0);
54621 if (BCSrc.getValueType() == MVT::x86mmx)
54622 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54623 }
54624
54625 return SDValue();
54626}
54627
54628static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54629 TargetLowering::DAGCombinerInfo &DCI) {
54630 EVT VT = N->getValueType(0);
54631 SDValue In = N->getOperand(0);
54632 SDLoc DL(N);
54633
54634 if (SDValue SSatVal = detectSSatPattern(In, VT))
54635 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54636 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54637 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54638
54639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54640 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54641 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54642 return SDValue(N, 0);
54643
54644 return SDValue();
54645}
54646
54647/// Returns the negated value if the node \p N flips the sign of an FP value.
54648///
54649/// An FP-negation node may have different forms: FNEG(x), FXOR(x, 0x80000000)
54650/// or FSUB(0, x).
54651/// AVX512F does not have FXOR, so FNEG is lowered as
54652/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54653/// In this case we go through all bitcasts.
54654/// This also recognizes splat of a negated value and returns the splat of that
54655/// value.
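/// e.g. (v4f32 (bitcast (xor (bitcast X), <0x80000000 splat>))) returns X.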
54656static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54657 if (N->getOpcode() == ISD::FNEG)
54658 return N->getOperand(0);
54659
54660 // Don't recurse exponentially.
54661 if (Depth > SelectionDAG::MaxRecursionDepth)
54662 return SDValue();
54663
54664 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54665
54666 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54667 EVT VT = Op->getValueType(0);
54668
54669 // Make sure the element size doesn't change.
54670 if (VT.getScalarSizeInBits() != ScalarSize)
54671 return SDValue();
54672
54673 unsigned Opc = Op.getOpcode();
54674 switch (Opc) {
54675 case ISD::VECTOR_SHUFFLE: {
54676 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54677 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54678 if (!Op.getOperand(1).isUndef())
54679 return SDValue();
54680 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54681 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54682 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54683 cast<ShuffleVectorSDNode>(Op)->getMask());
54684 break;
54685 }
54686 case ISD::INSERT_VECTOR_ELT: {
54687 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54688 // -V, INDEX).
54689 SDValue InsVector = Op.getOperand(0);
54690 SDValue InsVal = Op.getOperand(1);
54691 if (!InsVector.isUndef())
54692 return SDValue();
54693 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54694 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54695 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54696 NegInsVal, Op.getOperand(2));
54697 break;
54698 }
54699 case ISD::FSUB:
54700 case ISD::XOR:
54701 case X86ISD::FXOR: {
54702 SDValue Op1 = Op.getOperand(1);
54703 SDValue Op0 = Op.getOperand(0);
54704
54705 // For XOR and FXOR, we want to check if constant
54706 // bits of Op1 are sign bit masks. For FSUB, we
54707 // have to check if constant bits of Op0 are sign
54708 // bit masks and hence we swap the operands.
54709 if (Opc == ISD::FSUB)
54710 std::swap(Op0, Op1);
54711
54712 APInt UndefElts;
54713 SmallVector<APInt, 16> EltBits;
54714 // Extract constant bits and see if they are all
54715 // sign bit masks. Ignore the undef elements.
54716 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54717 /* AllowWholeUndefs */ true,
54718 /* AllowPartialUndefs */ false)) {
54719 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54720 if (!UndefElts[I] && !EltBits[I].isSignMask())
54721 return SDValue();
54722
54723 // Only allow bitcast from correctly-sized constant.
54724 Op0 = peekThroughBitcasts(Op0);
54725 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54726 return Op0;
54727 }
54728 break;
54729 } // case
54730 } // switch
54731
54732 return SDValue();
54733}
54734
54735static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54736 bool NegRes) {
54737 if (NegMul) {
54738 switch (Opcode) {
54739 // clang-format off
54740 default: llvm_unreachable("Unexpected opcode");
54741 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54742 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54743 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54744 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54745 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54746 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54747 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54748 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54749 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54750 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54751 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54752 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54753 // clang-format on
54754 }
54755 }
54756
54757 if (NegAcc) {
54758 switch (Opcode) {
54759 // clang-format off
54760 default: llvm_unreachable("Unexpected opcode");
54761 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54762 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54763 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54764 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54765 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54766 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54767 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54768 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54769 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54770 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54771 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54772 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54773 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54774 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54775 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54776 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54777 // clang-format on
54778 }
54779 }
54780
54781 if (NegRes) {
54782 switch (Opcode) {
54783 // For accuracy reasons, we never combine fneg and fma under strict FP.
54784 // clang-format off
54785 default: llvm_unreachable("Unexpected opcode");
54786 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54787 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54788 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54789 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54790 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54791 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54792 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54793 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54794 // clang-format on
54795 }
54796 }
54797
54798 return Opcode;
54799}
54800
54801/// Do target-specific dag combines on floating point negations.
54802static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54803 TargetLowering::DAGCombinerInfo &DCI,
54804 const X86Subtarget &Subtarget) {
54805 EVT OrigVT = N->getValueType(0);
54806 SDValue Arg = isFNEG(DAG, N);
54807 if (!Arg)
54808 return SDValue();
54809
54810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54811 EVT VT = Arg.getValueType();
54812 EVT SVT = VT.getScalarType();
54813 SDLoc DL(N);
54814
54815 // Let legalize expand this if it isn't a legal type yet.
54816 if (!TLI.isTypeLegal(VT))
54817 return SDValue();
54818
54819 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54820 // use of a constant by performing (-0 - A*B) instead.
54821 // FIXME: Check rounding control flags as well once it becomes available.
54822 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54823 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54824 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54825 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54826 Arg.getOperand(1), Zero);
54827 return DAG.getBitcast(OrigVT, NewNode);
54828 }
54829
54830 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54831 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54832 if (SDValue NegArg =
54833 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54834 return DAG.getBitcast(OrigVT, NegArg);
54835
54836 return SDValue();
54837}
54838
54839SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54840 bool LegalOperations,
54841 bool ForCodeSize,
54842 NegatibleCost &Cost,
54843 unsigned Depth) const {
54844 // fneg patterns are removable even if they have multiple uses.
54845 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54846 Cost = NegatibleCost::Cheaper;
54847 return DAG.getBitcast(Op.getValueType(), Arg);
54848 }
54849
54850 EVT VT = Op.getValueType();
54851 EVT SVT = VT.getScalarType();
54852 unsigned Opc = Op.getOpcode();
54853 SDNodeFlags Flags = Op.getNode()->getFlags();
54854 switch (Opc) {
54855 case ISD::FMA:
54856 case X86ISD::FMSUB:
54857 case X86ISD::FNMADD:
54858 case X86ISD::FNMSUB:
54859 case X86ISD::FMADD_RND:
54860 case X86ISD::FMSUB_RND:
54861 case X86ISD::FNMADD_RND:
54862 case X86ISD::FNMSUB_RND: {
54863 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54864 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54865 !isOperationLegal(ISD::FMA, VT))
54866 break;
54867
54868 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54869 // if it may have signed zeros.
54870 if (!Flags.hasNoSignedZeros())
54871 break;
54872
54873 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54874 // keep temporary nodes alive.
54875 std::list<HandleSDNode> Handles;
54876
54877 // This is always negatible for free but we might be able to remove some
54878 // extra operand negations as well.
54879 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54880 for (int i = 0; i != 3; ++i) {
54881 NewOps[i] = getCheaperNegatedExpression(
54882 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54883 if (!!NewOps[i])
54884 Handles.emplace_back(NewOps[i]);
54885 }
54886
54887 bool NegA = !!NewOps[0];
54888 bool NegB = !!NewOps[1];
54889 bool NegC = !!NewOps[2];
54890 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54891
54892 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54893 : NegatibleCost::Neutral;
54894
54895 // Fill in the non-negated ops with the original values.
54896 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54897 if (!NewOps[i])
54898 NewOps[i] = Op.getOperand(i);
54899 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54900 }
54901 case X86ISD::FRCP:
54902 if (SDValue NegOp0 =
54903 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54904 ForCodeSize, Cost, Depth + 1))
54905 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54906 break;
54907 }
54908
54909 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54910 ForCodeSize, Cost, Depth);
54911}
54912
54913static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54914 const X86Subtarget &Subtarget) {
54915 MVT VT = N->getSimpleValueType(0);
54916 // If we have integer vector types available, use the integer opcodes.
54917 if (!VT.isVector() || !Subtarget.hasSSE2())
54918 return SDValue();
54919
54920 SDLoc dl(N);
54921 MVT IntVT = VT.changeVectorElementTypeToInteger();
54922 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54923 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54924 unsigned IntOpcode;
54925 switch (N->getOpcode()) {
54926 // clang-format off
54927 default: llvm_unreachable("Unexpected FP logic op");
54928 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54929 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54930 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54931 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54932 // clang-format on
54933 }
54934 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54935 return DAG.getBitcast(VT, IntOp);
54936}
54937
54938/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54939static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54940 if (N->getOpcode() != ISD::XOR)
54941 return SDValue();
54942
54943 SDValue LHS = N->getOperand(0);
54944 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54945 return SDValue();
54946
54947 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54948 X86::CondCode(LHS->getConstantOperandVal(0)));
54949 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54950}
54951
54952static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54953 const X86Subtarget &Subtarget) {
54954 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54955 "Invalid opcode for combing with CTLZ");
54956 if (Subtarget.hasFastLZCNT())
54957 return SDValue();
54958
54959 EVT VT = N->getValueType(0);
54960 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54961 (VT != MVT::i64 || !Subtarget.is64Bit()))
54962 return SDValue();
54963
54964 SDValue N0 = N->getOperand(0);
54965 SDValue N1 = N->getOperand(1);
54966
54967 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54968 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54969 return SDValue();
54970
54971 SDValue OpCTLZ;
54972 SDValue OpSizeTM1;
54973
54974 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54975 OpCTLZ = N1;
54976 OpSizeTM1 = N0;
54977 } else if (N->getOpcode() == ISD::SUB) {
54978 return SDValue();
54979 } else {
54980 OpCTLZ = N0;
54981 OpSizeTM1 = N1;
54982 }
54983
54984 if (!OpCTLZ.hasOneUse())
54985 return SDValue();
54986 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54987 if (!C)
54988 return SDValue();
54989
54990 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54991 return SDValue();
54992 EVT OpVT = VT;
54993 SDValue Op = OpCTLZ.getOperand(0);
54994 if (VT == MVT::i8) {
54995 // Zero extend to i32 since there is no i8 BSR.
54996 OpVT = MVT::i32;
54997 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54998 }
54999
55000 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55001 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55002 if (VT == MVT::i8)
55003 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55004
55005 return Op;
55006}
55007
55008static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55009 TargetLowering::DAGCombinerInfo &DCI,
55010 const X86Subtarget &Subtarget) {
55011 SDValue N0 = N->getOperand(0);
55012 SDValue N1 = N->getOperand(1);
55013 EVT VT = N->getValueType(0);
55014 SDLoc DL(N);
55015
55016 // If this is SSE1 only convert to FXOR to avoid scalarization.
55017 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55018 return DAG.getBitcast(MVT::v4i32,
55019 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55020 DAG.getBitcast(MVT::v4f32, N0),
55021 DAG.getBitcast(MVT::v4f32, N1)));
55022 }
55023
55024 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55025 return Cmp;
55026
55027 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55028 return R;
55029
55030 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55031 return R;
55032
55033 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55034 return R;
55035
55036 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55037 DAG, DCI, Subtarget))
55038 return FPLogic;
55039
55040 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55041 return R;
55042
55043 if (DCI.isBeforeLegalizeOps())
55044 return SDValue();
55045
55046 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55047 return SetCC;
55048
55049 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55050 return R;
55051
55052 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55053 return RV;
55054
55055 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55056 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55057 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55058 N0.getOperand(0).getValueType().isVector() &&
55059 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55060 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55061 return DAG.getBitcast(
55062 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55063 }
55064
55065 // Handle AVX512 mask widening.
55066 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55067 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55068 VT.getVectorElementType() == MVT::i1 &&
55069 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55070 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55071 return DAG.getNode(
55072 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55073 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55074 N0.getOperand(2));
55075 }
55076
55077 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55078 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55079 // TODO: Under what circumstances could this be performed in DAGCombine?
55080 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55081 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55082 SDValue TruncExtSrc = N0.getOperand(0);
55083 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55084 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55085 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55086 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55087 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55088 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55089 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55090 }
55091 }
55092
55093 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55094 return R;
55095
55096 return combineFneg(N, DAG, DCI, Subtarget);
55097}
55098
55099static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55100 TargetLowering::DAGCombinerInfo &DCI,
55101 const X86Subtarget &Subtarget) {
55102 SDValue N0 = N->getOperand(0);
55103 EVT VT = N->getValueType(0);
55104
55105 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55106 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55107 SDValue Src = N0.getOperand(0);
55108 EVT SrcVT = Src.getValueType();
55109 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55110 (DCI.isBeforeLegalize() ||
55111 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55112 Subtarget.hasSSSE3()) {
55113 unsigned NumElts = SrcVT.getVectorNumElements();
55114 SmallVector<int, 32> ReverseMask(NumElts);
55115 for (unsigned I = 0; I != NumElts; ++I)
55116 ReverseMask[I] = (NumElts - 1) - I;
55117 SDValue Rev =
55118 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55119 return DAG.getBitcast(VT, Rev);
55120 }
55121 }
55122
55123 return SDValue();
55124}
55125
55126// Various combines to try to convert to avgceilu.
55127static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55128 TargetLowering::DAGCombinerInfo &DCI,
55129 const X86Subtarget &Subtarget) {
55130 unsigned Opcode = N->getOpcode();
55131 SDValue N0 = N->getOperand(0);
55132 SDValue N1 = N->getOperand(1);
55133 EVT VT = N->getValueType(0);
55134 EVT SVT = VT.getScalarType();
55135 SDLoc DL(N);
55136
55137 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55138 // Only useful on vXi8 which doesn't have good SRA handling.
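 // For example (illustrative, one i8 lane): avgceils(-1, 1) == 0; flipping the
 // sign bits maps the inputs to 127 and 129, avgceilu(127, 129) == 128, and
 // flipping the sign bit back yields 0 again.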
55139 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55140 APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits());
55141 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55142 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55143 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55144 return DAG.getNode(ISD::XOR, DL, VT,
55145 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55146 }
55147
55148 return SDValue();
55149}
55150
55151static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
55152 TargetLowering::DAGCombinerInfo &DCI,
55153 const X86Subtarget &Subtarget) {
55154 EVT VT = N->getValueType(0);
55155 unsigned NumBits = VT.getSizeInBits();
55156
55157 // TODO - Constant Folding.
55158
55159 // Simplify the inputs.
55160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55161 APInt DemandedMask(APInt::getAllOnes(NumBits));
55162 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55163 return SDValue(N, 0);
55164
55165 return SDValue();
55166}
55167
55168static bool isNullFPScalarOrVectorConst(SDValue V) {
55169 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55170}
55171
55172/// If a value is a scalar FP zero or a vector FP zero (potentially including
55173/// undefined elements), return a zero constant that may be used to fold away
55174/// that value. In the case of a vector, the returned constant will not contain
55175/// undefined elements even if the input parameter does. This makes it suitable
55176/// to be used as a replacement operand with operations (e.g., bitwise-and) where
55177/// an undef should not propagate.
55178static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55179 const X86Subtarget &Subtarget) {
55180 if (!isNullFPScalarOrVectorConst(V))
55181 return SDValue();
55182
55183 if (V.getValueType().isVector())
55184 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55185
55186 return V;
55187}
55188
55189static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55190 const X86Subtarget &Subtarget) {
55191 SDValue N0 = N->getOperand(0);
55192 SDValue N1 = N->getOperand(1);
55193 EVT VT = N->getValueType(0);
55194 SDLoc DL(N);
55195
55196 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55197 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55198 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55199 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55200 return SDValue();
55201
55202 auto isAllOnesConstantFP = [](SDValue V) {
55203 if (V.getSimpleValueType().isVector())
55204 return ISD::isBuildVectorAllOnes(V.getNode());
55205 auto *C = dyn_cast<ConstantFPSDNode>(V);
55206 return C && C->getConstantFPValue()->isAllOnesValue();
55207 };
55208
55209 // fand (fxor X, -1), Y --> fandn X, Y
55210 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55211 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55212
55213 // fand X, (fxor Y, -1) --> fandn Y, X
55214 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55215 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55216
55217 return SDValue();
55218}
55219
55220/// Do target-specific dag combines on X86ISD::FAND nodes.
55221static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55222 const X86Subtarget &Subtarget) {
55223 // FAND(0.0, x) -> 0.0
55224 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55225 return V;
55226
55227 // FAND(x, 0.0) -> 0.0
55228 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55229 return V;
55230
55231 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55232 return V;
55233
55234 return lowerX86FPLogicOp(N, DAG, Subtarget);
55235}
55236
55237/// Do target-specific dag combines on X86ISD::FANDN nodes.
55238static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55239 const X86Subtarget &Subtarget) {
55240 // FANDN(0.0, x) -> x
55241 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55242 return N->getOperand(1);
55243
55244 // FANDN(x, 0.0) -> 0.0
55245 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55246 return V;
55247
55248 return lowerX86FPLogicOp(N, DAG, Subtarget);
55249}
55250
55251/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55252static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55253 TargetLowering::DAGCombinerInfo &DCI,
55254 const X86Subtarget &Subtarget) {
55255 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55256
55257 // F[X]OR(0.0, x) -> x
55258 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55259 return N->getOperand(1);
55260
55261 // F[X]OR(x, 0.0) -> x
55262 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55263 return N->getOperand(0);
55264
55265 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55266 return NewVal;
55267
55268 return lowerX86FPLogicOp(N, DAG, Subtarget);
55269}
55270
55271/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55272static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55273 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55274
55275 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55276 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55277 !DAG.getTarget().Options.NoSignedZerosFPMath)
55278 return SDValue();
55279
55280 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
55281 // into FMINC and FMAXC, which are commutative operations.
55282 unsigned NewOp = 0;
55283 switch (N->getOpcode()) {
55284 default: llvm_unreachable("unknown opcode");
55285 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55286 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55287 }
55288
55289 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55290 N->getOperand(0), N->getOperand(1));
55291}
55292
55293static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55294 const X86Subtarget &Subtarget) {
55295 EVT VT = N->getValueType(0);
55296 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55297 return SDValue();
55298
55299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55300
55301 auto IsMinMaxLegal = [&](EVT VT) {
55302 if (!TLI.isTypeLegal(VT))
55303 return false;
55304 return VT.getScalarType() != MVT::f16 ||
55305 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55306 };
55307
55308 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55309 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55310 (Subtarget.hasFP16() && VT == MVT::f16) ||
55311 (VT.isVector() && IsMinMaxLegal(VT))))
55312 return SDValue();
55313
55314 SDValue Op0 = N->getOperand(0);
55315 SDValue Op1 = N->getOperand(1);
55316 SDLoc DL(N);
55317 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55318
55319 // If we don't have to respect NaN inputs, this is a direct translation to x86
55320 // min/max instructions.
55321 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55322 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55323
55324 // If one of the operands is known non-NaN, use the native min/max
55325 // instructions with the non-NaN input as the second operand.
55326 if (DAG.isKnownNeverNaN(Op1))
55327 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55328 if (DAG.isKnownNeverNaN(Op0))
55329 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55330
55331 // If we have to respect NaN inputs, this takes at least 3 instructions.
55332 // Favor a library call when operating on a scalar and minimizing code size.
55333 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55334 return SDValue();
55335
55336 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55337 VT);
55338
55339 // There are 4 possibilities involving NaN inputs, and these are the required
55340 // outputs:
55341 // Op1
55342 // Num NaN
55343 // ----------------
55344 // Num | Max | Op0 |
55345 // Op0 ----------------
55346 // NaN | Op1 | NaN |
55347 // ----------------
55348 //
55349 // The SSE FP max/min instructions were not designed for this case, but rather
55350 // to implement:
55351 // Min = Op1 < Op0 ? Op1 : Op0
55352 // Max = Op1 > Op0 ? Op1 : Op0
55353 //
55354 // So they always return Op0 if either input is a NaN. However, we can still
55355 // use those instructions for fmaxnum by selecting away a NaN input.
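 // For example (illustrative, with Op0 = x and Op1 = y, neither known
 // non-NaN), the sequence built below is:
 //   MinOrMax = FMIN/FMAX(y, x)      // returns x if either input is NaN
 //   IsOp0Nan = setcc(x, x, SETUO)   // true iff x is NaN
 //   result   = select(IsOp0Nan, y, MinOrMax)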
55356
55357 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55358 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55359 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55360
55361 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55362 // are NaN, the NaN value of Op1 is the result.
55363 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55364}
55365
55366static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55367 TargetLowering::DAGCombinerInfo &DCI) {
55368 EVT VT = N->getValueType(0);
55369 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55370
55371 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55372 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55373 return SDValue(N, 0);
55374
55375 // Convert a full vector load into vzload when not all bits are needed.
55376 SDValue In = N->getOperand(0);
55377 MVT InVT = In.getSimpleValueType();
55378 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55379 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55380 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55381 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55382 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55383 MVT MemVT = MVT::getIntegerVT(NumBits);
55384 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55385 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55386 SDLoc dl(N);
55387 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55388 DAG.getBitcast(InVT, VZLoad));
55389 DCI.CombineTo(N, Convert);
55390 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55391 DCI.recursivelyDeleteUnusedNodes(LN);
55392 return SDValue(N, 0);
55393 }
55394 }
55395
55396 return SDValue();
55397}
55398
55399static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55400 TargetLowering::DAGCombinerInfo &DCI) {
55401 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55402 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55403 EVT VT = N->getValueType(0);
55404
55405 // Convert a full vector load into vzload when not all bits are needed.
55406 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55407 MVT InVT = In.getSimpleValueType();
55408 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55409 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55410 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55411 LoadSDNode *LN = cast<LoadSDNode>(In);
55412 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55413 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55414 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55415 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55416 SDLoc dl(N);
55417 if (IsStrict) {
55418 SDValue Convert =
55419 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55420 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55421 DCI.CombineTo(N, Convert, Convert.getValue(1));
55422 } else {
55423 SDValue Convert =
55424 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55425 DCI.CombineTo(N, Convert);
55426 }
55427 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55428 DCI.recursivelyDeleteUnusedNodes(LN);
55429 return SDValue(N, 0);
55430 }
55431 }
55432
55433 return SDValue();
55434}
55435
55436/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55437static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55438 TargetLowering::DAGCombinerInfo &DCI,
55439 const X86Subtarget &Subtarget) {
55440 SDValue N0 = N->getOperand(0);
55441 SDValue N1 = N->getOperand(1);
55442 MVT VT = N->getSimpleValueType(0);
55443 int NumElts = VT.getVectorNumElements();
55444 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55445 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55446 SDLoc DL(N);
55447
55448 // ANDNP(undef, x) -> 0
55449 // ANDNP(x, undef) -> 0
55450 if (N0.isUndef() || N1.isUndef())
55451 return DAG.getConstant(0, DL, VT);
55452
55453 // ANDNP(0, x) -> x
55454 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55455 return N1;
55456
55457 // ANDNP(x, 0) -> 0
55458 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55459 return DAG.getConstant(0, DL, VT);
55460
55461 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55462 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55463 return DAG.getNOT(DL, N0, VT);
55464
55465 // Turn ANDNP back to AND if input is inverted.
55466 if (SDValue Not = IsNOT(N0, DAG))
55467 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55468
55469 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55470 // to make use of predicated selects.
55471 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55472 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55473 SDValue Src = N0.getOperand(0);
55474 EVT SrcVT = Src.getValueType();
55475 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55476 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55477 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55478 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55479 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55480 getZeroVector(VT, Subtarget, DAG, DL));
55481 }
55482
55483 // Constant Folding
55484 APInt Undefs0, Undefs1;
55485 SmallVector<APInt> EltBits0, EltBits1;
55486 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55487 /*AllowWholeUndefs*/ true,
55488 /*AllowPartialUndefs*/ true)) {
55489 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55490 /*AllowWholeUndefs*/ true,
55491 /*AllowPartialUndefs*/ true)) {
55492 SmallVector<APInt> ResultBits;
55493 for (int I = 0; I != NumElts; ++I)
55494 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55495 return getConstVector(ResultBits, VT, DAG, DL);
55496 }
55497
55498 // Constant fold NOT(N0) to allow us to use AND.
55499 // Ensure this is only performed if we can confirm that the bitcasted source
55500 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55501 if (N0->hasOneUse()) {
55502 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55503 if (BC0.getOpcode() != ISD::BITCAST) {
55504 for (APInt &Elt : EltBits0)
55505 Elt = ~Elt;
55506 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55507 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55508 }
55509 }
55510 }
55511
55512 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55513 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55514 SDValue Op(N, 0);
55515 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55516 return Res;
55517
55518 // If either operand is a constant mask, then only the elements that aren't
55519 // zero are actually demanded by the other operand.
55520 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55521 APInt UndefElts;
55522 SmallVector<APInt> EltBits;
55523 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55524 APInt DemandedElts = APInt::getAllOnes(NumElts);
55525 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55526 EltBits)) {
55527 DemandedBits.clearAllBits();
55528 DemandedElts.clearAllBits();
55529 for (int I = 0; I != NumElts; ++I) {
55530 if (UndefElts[I]) {
55531 // We can't assume an undef src element gives an undef dst - the
55532 // other src might be zero.
55533 DemandedBits.setAllBits();
55534 DemandedElts.setBit(I);
55535 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55536 (!Invert && !EltBits[I].isZero())) {
55537 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55538 DemandedElts.setBit(I);
55539 }
55540 }
55541 }
55542 return std::make_pair(DemandedBits, DemandedElts);
55543 };
55544 APInt Bits0, Elts0;
55545 APInt Bits1, Elts1;
55546 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55547 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55548
55549 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55550 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55551 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55552 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55553 if (N->getOpcode() != ISD::DELETED_NODE)
55554 DCI.AddToWorklist(N);
55555 return SDValue(N, 0);
55556 }
55557 }
55558
55559 // Folds for better commutativity:
55560 if (N1->hasOneUse()) {
55561 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55562 if (SDValue Not = IsNOT(N1, DAG))
55563 return DAG.getNOT(
55564 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55565
55566 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55567 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55568 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55569 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55570 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55571 EVT ShufVT = BC1.getValueType();
55572 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55573 DAG.getBitcast(ShufVT, N0));
55574 SDValue NewShuf =
55575 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55576 return DAG.getBitcast(VT, NewShuf);
55577 }
55578 }
55579 }
55580
55581 return SDValue();
55582}
55583
55584static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55585 TargetLowering::DAGCombinerInfo &DCI) {
55586 SDValue N1 = N->getOperand(1);
55587
55588 // BT ignores high bits in the bit index operand.
55589 unsigned BitWidth = N1.getValueSizeInBits();
55590 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55591 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55592 if (N->getOpcode() != ISD::DELETED_NODE)
55593 DCI.AddToWorklist(N);
55594 return SDValue(N, 0);
55595 }
55596
55597 return SDValue();
55598}
55599
55600static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55601 TargetLowering::DAGCombinerInfo &DCI) {
55602 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55603 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55604
55605 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55607 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55608 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55609 if (N->getOpcode() != ISD::DELETED_NODE)
55610 DCI.AddToWorklist(N);
55611 return SDValue(N, 0);
55612 }
55613
55614 // Convert a full vector load into vzload when not all bits are needed.
55615 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55616 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55617 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55618 SDLoc dl(N);
55619 if (IsStrict) {
55620 SDValue Convert = DAG.getNode(
55621 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55622 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55623 DCI.CombineTo(N, Convert, Convert.getValue(1));
55624 } else {
55625 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55626 DAG.getBitcast(MVT::v8i16, VZLoad));
55627 DCI.CombineTo(N, Convert);
55628 }
55629
55630 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55631 DCI.recursivelyDeleteUnusedNodes(LN);
55632 return SDValue(N, 0);
55633 }
55634 }
55635 }
55636
55637 return SDValue();
55638}
55639
55640// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55641static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55642 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55643
55644 EVT DstVT = N->getValueType(0);
55645
55646 SDValue N0 = N->getOperand(0);
55647 SDValue N1 = N->getOperand(1);
55648 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55649
55650 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55651 return SDValue();
55652
55653 // Look through single use any_extends / truncs.
55654 SDValue IntermediateBitwidthOp;
55655 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55656 N0.hasOneUse()) {
55657 IntermediateBitwidthOp = N0;
55658 N0 = N0.getOperand(0);
55659 }
55660
55661 // See if we have a single use cmov.
55662 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55663 return SDValue();
55664
55665 SDValue CMovOp0 = N0.getOperand(0);
55666 SDValue CMovOp1 = N0.getOperand(1);
55667
55668 // Make sure both operands are constants.
55669 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55670 !isa<ConstantSDNode>(CMovOp1.getNode()))
55671 return SDValue();
55672
55673 SDLoc DL(N);
55674
55675 // If we looked through an any_extend/trunc above, apply it to the constants.
55676 if (IntermediateBitwidthOp) {
55677 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55678 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55679 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55680 }
55681
55682 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55683 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55684
55685 EVT CMovVT = DstVT;
55686 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55687 if (DstVT == MVT::i16) {
55688 CMovVT = MVT::i32;
55689 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55690 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55691 }
55692
55693 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55694 N0.getOperand(2), N0.getOperand(3));
55695
55696 if (CMovVT != DstVT)
55697 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55698
55699 return CMov;
55700}
55701
55702static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55703 const X86Subtarget &Subtarget) {
55704 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55705
55706 if (SDValue V = combineSextInRegCmov(N, DAG))
55707 return V;
55708
55709 EVT VT = N->getValueType(0);
55710 SDValue N0 = N->getOperand(0);
55711 SDValue N1 = N->getOperand(1);
55712 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55713 SDLoc dl(N);
55714
55715 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
55716 // AVX2 since there is no sign-extended shift right operation on a vector
55717 // with 64-bit elements.
55718 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
55719 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
55720 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55721 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55722 SDValue N00 = N0.getOperand(0);
55723
55724 // EXTLOAD has a better solution on AVX2:
55725 // it may be replaced with an X86ISD::VSEXT node.
55726 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55727 if (!ISD::isNormalLoad(N00.getNode()))
55728 return SDValue();
55729
55730 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
55731 // gets in the way.
55732 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55733 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55734
55735 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55736 SDValue Tmp =
55737 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55738 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55739 }
55740 }
55741 return SDValue();
55742}
55743
55744/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55745/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55746/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55747/// opportunities to combine math ops, use an LEA, or use a complex addressing
55748/// mode. This can eliminate extend, add, and shift instructions.
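///
/// For example (illustrative):
///   (i64 sext (add nsw i32 %x, 40)), where the sext feeds an add or shl,
/// becomes
///   (add nsw i64 (sext %x), 40)
/// which can then fold into an LEA-style address computation.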
55749static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55750 const X86Subtarget &Subtarget) {
55751 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55752 Ext->getOpcode() != ISD::ZERO_EXTEND)
55753 return SDValue();
55754
55755 // TODO: This should be valid for other integer types.
55756 EVT VT = Ext->getValueType(0);
55757 if (VT != MVT::i64)
55758 return SDValue();
55759
55760 SDValue Add = Ext->getOperand(0);
55761 if (Add.getOpcode() != ISD::ADD)
55762 return SDValue();
55763
55764 SDValue AddOp0 = Add.getOperand(0);
55765 SDValue AddOp1 = Add.getOperand(1);
55766 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55767 bool NSW = Add->getFlags().hasNoSignedWrap();
55768 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55769 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55770 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55771
55772 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
55773 // into the 'zext'.
55774 if ((Sext && !NSW) || (!Sext && !NUW))
55775 return SDValue();
55776
55777 // Having a constant operand to the 'add' ensures that we are not increasing
55778 // the instruction count because the constant is extended for free below.
55779 // A constant operand can also become the displacement field of an LEA.
55780 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55781 if (!AddOp1C)
55782 return SDValue();
55783
55784 // Don't make the 'add' bigger if there's no hope of combining it with some
55785 // other 'add' or 'shl' instruction.
55786 // TODO: It may be profitable to generate simpler LEA instructions in place
55787 // of single 'add' instructions, but the cost model for selecting an LEA
55788 // currently has a high threshold.
55789 bool HasLEAPotential = false;
55790 for (auto *User : Ext->users()) {
55791 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55792 HasLEAPotential = true;
55793 break;
55794 }
55795 }
55796 if (!HasLEAPotential)
55797 return SDValue();
55798
55799 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55800 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55801 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55802 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55803
55804 // The wider add is guaranteed not to wrap because both operands are
55805 // sign- or zero-extended.
55806 SDNodeFlags Flags;
55807 Flags.setNoSignedWrap(NSW);
55808 Flags.setNoUnsignedWrap(NUW);
55809 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55810}
55811
55812// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55813// operands and the result of CMOV is not used anywhere else - promote CMOV
55814// itself instead of promoting its result. This could be beneficial, because:
55815// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55816// (or more) pseudo-CMOVs only when they go one-after-another and
55817// getting rid of result extension code after CMOV will help that.
55818// 2) Promotion of constant CMOV arguments is free, hence the
55819// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55820// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55821// promotion is also good in terms of code-size.
55822// (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55823// promotion).
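// For example (illustrative): (zext i32 (cmov i16 7, 9, cond, eflags)) becomes
// (cmov i32 7, 9, cond, eflags); the constant operands are extended for free
// and the separate extension of the CMOV result disappears.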
55824static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55825 SDValue CMovN = Extend->getOperand(0);
55826 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55827 return SDValue();
55828
55829 EVT TargetVT = Extend->getValueType(0);
55830 unsigned ExtendOpcode = Extend->getOpcode();
55831 SDLoc DL(Extend);
55832
55833 EVT VT = CMovN.getValueType();
55834 SDValue CMovOp0 = CMovN.getOperand(0);
55835 SDValue CMovOp1 = CMovN.getOperand(1);
55836
55837 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55838 !isa<ConstantSDNode>(CMovOp1.getNode()))
55839 return SDValue();
55840
55841 // Only extend to i32 or i64.
55842 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55843 return SDValue();
55844
55845 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
55846 // i32 are free.
55847 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55848 return SDValue();
55849
55850 // If this a zero extend to i64, we should only extend to i32 and use a free
55851 // zero extend to finish.
55852 EVT ExtendVT = TargetVT;
55853 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55854 ExtendVT = MVT::i32;
55855
55856 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55857 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55858
55859 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55860 CMovN.getOperand(2), CMovN.getOperand(3));
55861
55862 // Finish extending if needed.
55863 if (ExtendVT != TargetVT)
55864 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55865
55866 return Res;
55867}
55868
55869// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55870// result type.
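// For example (illustrative, AVX512 with v8i32 operands):
//   (sext v8i32 (setcc v8i1 %a, %b, setgt))
// is rebuilt as
//   (setcc v8i32 %a, %b, setgt)
// so the compare directly produces the sign-extended vector result.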
55871static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55872 const X86Subtarget &Subtarget) {
55873 SDValue N0 = N->getOperand(0);
55874 EVT VT = N->getValueType(0);
55875 SDLoc dl(N);
55876
55877 // Only do this combine with AVX512 for vector extends.
55878 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55879 return SDValue();
55880
55881 // Only combine legal element types.
55882 EVT SVT = VT.getVectorElementType();
55883 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55884 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55885 return SDValue();
55886
55887 // We don't have a CMPP instruction for vXf16.
55888 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55889 return SDValue();
55890 // We can only do this if the vector size is 256 bits or less.
55891 unsigned Size = VT.getSizeInBits();
55892 if (Size > 256 && Subtarget.useAVX512Regs())
55893 return SDValue();
55894
55895 EVT N00VT = N0.getOperand(0).getValueType();
55896
55897 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55898 // those are the only integer compares we have.
55899 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55900 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55901 return SDValue();
55902
55903 // Only do this combine if the extension will be fully consumed by the setcc.
55904 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55905 if (Size != MatchingVecType.getSizeInBits())
55906 return SDValue();
55907
55908 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55909
55910 if (N->getOpcode() == ISD::ZERO_EXTEND)
55911 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55912
55913 return Res;
55914}
55915
55916static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55917 TargetLowering::DAGCombinerInfo &DCI,
55918 const X86Subtarget &Subtarget) {
55919 SDValue N0 = N->getOperand(0);
55920 EVT VT = N->getValueType(0);
55921 SDLoc DL(N);
55922
55923 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55924 if (!DCI.isBeforeLegalizeOps() &&
55925 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55926 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55927 N0->getOperand(1));
55928 bool ReplaceOtherUses = !N0.hasOneUse();
55929 DCI.CombineTo(N, Setcc);
55930 // Replace other uses with a truncate of the widened setcc_carry.
55931 if (ReplaceOtherUses) {
55932 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55933 N0.getValueType(), Setcc);
55934 DCI.CombineTo(N0.getNode(), Trunc);
55935 }
55936
55937 return SDValue(N, 0);
55938 }
55939
55940 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55941 return NewCMov;
55942
55943 if (!DCI.isBeforeLegalizeOps())
55944 return SDValue();
55945
55946 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55947 return V;
55948
55949 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55950 DAG, DCI, Subtarget))
55951 return V;
55952
55953 if (VT.isVector()) {
55954 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55955 return R;
55956
55957 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55958 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55959 }
55960
55961 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55962 return NewAdd;
55963
55964 return SDValue();
55965}
55966
55967// Inverting a constant vector is profitable if it can be eliminated and the
55968// inverted vector is already present in DAG. Otherwise, it will be loaded
55969// anyway.
55970//
55971// We determine which of the values can be completely eliminated and invert it.
55972// If both are eliminable, select a vector with the first negative element.
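// For example (illustrative): if an FMA multiplies by <2.0, 3.0> and the DAG
// already contains <-2.0, -3.0> with non-FMA users, it is cheaper to negate
// that FMA operand (switching to the matching negated FMA opcode) and reuse
// the existing negated constant than to materialize both vectors.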
55975 "ConstantFP build vector expected");
55976 // Check if we can eliminate V. We assume that if a value is only used in
55977 // FMAs, we can eliminate it, since this function is invoked for each FMA
55978 // with this vector.
55979 auto IsNotFMA = [](SDNode *User) {
55980 return User->getOpcode() != ISD::FMA &&
55981 User->getOpcode() != ISD::STRICT_FMA;
55982 };
55983 if (llvm::any_of(V->users(), IsNotFMA))
55984 return SDValue();
55985
55986 SmallVector<SDValue, 8> Ops;
55987 EVT VT = V.getValueType();
55988 EVT EltVT = VT.getVectorElementType();
55989 for (const SDValue &Op : V->op_values()) {
55990 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55991 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55992 } else {
55993 assert(Op.isUndef());
55994 Ops.push_back(DAG.getUNDEF(EltVT));
55995 }
55996 }
55997
55998 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55999 if (!NV)
56000 return SDValue();
56001
56002 // If an inverted version cannot be eliminated, choose it instead of the
56003 // original version.
56004 if (llvm::any_of(NV->users(), IsNotFMA))
56005 return SDValue(NV, 0);
56006
56007 // If the inverted version can also be eliminated, we have to consistently
56008 // prefer one of the values. We prefer the constant whose first defined
56009 // element is negative.
56010 // N.B. We need to skip undefs that may precede a value.
56011 for (const SDValue &Op : V->op_values()) {
56012 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56013 if (Cst->isNegative())
56014 return SDValue();
56015 break;
56016 }
56017 }
56018 return SDValue(NV, 0);
56019}
56020
56021static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56022 TargetLowering::DAGCombinerInfo &DCI,
56023 const X86Subtarget &Subtarget) {
56024 SDLoc dl(N);
56025 EVT VT = N->getValueType(0);
56026 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56027 bool IsStrict = N->isTargetOpcode()
56028 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56029 : N->isStrictFPOpcode();
56030
56031 // Let legalize expand this if it isn't a legal type yet.
56032 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56033 if (!TLI.isTypeLegal(VT))
56034 return SDValue();
56035
56036 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56037 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56038 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56039
56040 // If the operation allows fast-math and the target does not support FMA,
56041 // split this into mul+add to avoid libcall(s).
56042 SDNodeFlags Flags = N->getFlags();
56043 if (!IsStrict && Flags.hasAllowReassociation() &&
56044 TLI.isOperationExpand(ISD::FMA, VT)) {
56045 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56046 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56047 }
56048
56049 EVT ScalarVT = VT.getScalarType();
56050 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56051 !Subtarget.hasAnyFMA()) &&
56052 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56053 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56054 return SDValue();
56055
56056 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56057 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56058 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56059 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56060 CodeSize)) {
56061 V = NegV;
56062 return true;
56063 }
56064 // Look through extract_vector_elts. If it comes from an FNEG, create a
56065 // new extract from the FNEG input.
56066 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56067 isNullConstant(V.getOperand(1))) {
56068 SDValue Vec = V.getOperand(0);
56069 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56070 Vec, DAG, LegalOperations, CodeSize)) {
56071 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56072 NegV, V.getOperand(1));
56073 return true;
56074 }
56075 }
56076 // Lookup if there is an inverted version of constant vector V in DAG.
56077 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56078 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56079 V = NegV;
56080 return true;
56081 }
56082 }
56083 return false;
56084 };
56085
56086 // Do not convert the passthru input of scalar intrinsics.
56087 // FIXME: We could allow negations of the lower element only.
56088 bool NegA = invertIfNegative(A);
56089 // Create a dummy use for A so that in the process of negating B or C
56090 // recursively, it is not deleted.
56091 HandleSDNode NegAHandle(A);
56092 bool NegB = invertIfNegative(B);
56093 // Similar to A, get a handle on B.
56094 HandleSDNode NegBHandle(B);
56095 bool NegC = invertIfNegative(C);
56096
56097 if (!NegA && !NegB && !NegC)
56098 return SDValue();
56099
56100 unsigned NewOpcode =
56101 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56102
56103 // Propagate fast-math-flags to new FMA node.
56104 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56105 if (IsStrict) {
56106 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56107 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56108 {N->getOperand(0), A, B, C});
56109 } else {
56110 if (N->getNumOperands() == 4)
56111 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56112 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56113 }
56114}
56115
56116// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56117// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56118static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56119 TargetLowering::DAGCombinerInfo &DCI) {
56120 SDLoc dl(N);
56121 EVT VT = N->getValueType(0);
56122 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56123 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56124 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56125
56126 SDValue N2 = N->getOperand(2);
56127
56128 SDValue NegN2 =
56129 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56130 if (!NegN2)
56131 return SDValue();
56132 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56133
56134 if (N->getNumOperands() == 4)
56135 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56136 NegN2, N->getOperand(3));
56137 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56138 NegN2);
56139}
56140
56141// Try to widen the build vector and bitcast it to the type of the zext.
56142// This is a special case for the 128-bit vector types. The intention is to
56143// remove the zext and replace it with a bitcast to the wider type. While
56144// lowering, the bitcast is removed and the extra commutation due to the zext
56145// is avoided. For example:
56146// zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
56147// build_vector (x, 0, y, 0, z, 0, w, 0))
56148static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56149
56150 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56151 return SDValue();
56152
56153 EVT ExtendVT = Extend->getValueType(0);
56154
56155 SDValue BV = Extend->getOperand(0);
56156 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56157 return SDValue();
56158
56159 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56160 // If the build vector has undef elements, we cannot widen it.
56161 // The widening would create a vector with more undef elements, which
56162 // is not valid.
56163 return SDValue();
56164 }
56165
56166 if (!all_of(BV->op_values(),
56167 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56168 // If the build vector has any element other than an ISD::LOAD, we cannot
56169 // widen it.
56170 return SDValue();
56171 }
56172
56173 SDLoc dl(BV);
56174 EVT VT = BV.getValueType();
56175 EVT EltVT = BV.getOperand(0).getValueType();
56176 unsigned NumElts = VT.getVectorNumElements();
56177
56178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56179
56180 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56181 TargetLowering::TypeWidenVector)
56182 return SDValue();
56183
56184 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56185 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56186
56187 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56188 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56189 // Fill the new elements with Zero.
56190 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56191 // Compute the step used to place the elements in their final positions and
56192 // to control the iteration.
56193 unsigned step = WidenNumElts / NumElts;
56194 if (WidenVT.is128BitVector()) {
56195 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56196 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56197 i--, j -= step) {
56198 SDValue temp = NewOps[i];
56199 NewOps[i] = NewOps[j];
56200 NewOps[j] = temp;
56201 }
56202 // Create new build vector with WidenVT and NewOps
56203 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56204 // Replace the old build vector with the new one. Bitcast the
56205 // new build vector to the type of the zext.
56206 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56207 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56208 return NewBV;
56209 }
56210 }
56211 return SDValue();
56212}
56213
56214static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56215 TargetLowering::DAGCombinerInfo &DCI,
56216 const X86Subtarget &Subtarget) {
56217 SDLoc dl(N);
56218 SDValue N0 = N->getOperand(0);
56219 EVT VT = N->getValueType(0);
56220
56221 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56222 // FIXME: Is this needed? We don't seem to have any tests for it.
56223 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56225 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56226 N0->getOperand(1));
56227 bool ReplaceOtherUses = !N0.hasOneUse();
56228 DCI.CombineTo(N, Setcc);
56229 // Replace other uses with a truncate of the widened setcc_carry.
56230 if (ReplaceOtherUses) {
56231 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56232 N0.getValueType(), Setcc);
56233 DCI.CombineTo(N0.getNode(), Trunc);
56234 }
56235
56236 return SDValue(N, 0);
56237 }
56238
56239 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56240 return NewCMov;
56241
56242 if (DCI.isBeforeLegalizeOps())
56243 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56244 return V;
56245
56246 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56247 DAG, DCI, Subtarget))
56248 return V;
56249
56250 if (VT.isVector())
56251 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56252 return R;
56253
56254 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56255 return NewAdd;
56256
56257 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56258 return R;
56259
56260 // TODO: Combine with any target/faux shuffle.
56261 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56262 VT.getScalarSizeInBits() == 2 * N0.getScalarValueSizeInBits()) {
56263 SDValue N00 = N0.getOperand(0);
56264 SDValue N01 = N0.getOperand(1);
56265 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56266 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56267 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56268 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56269 return concatSubVectors(N00, N01, DAG, dl);
56270 }
56271 }
56272
56273 if (SDValue V = widenBuildVec(N, DAG))
56274 return V;
56275
56276 return SDValue();
56277}
56278
56279/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56280/// pre-promote its result type since vXi1 vectors don't get promoted
56281/// during type legalization.
56282static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56283 SDValue RHS, ISD::CondCode CC,
56284 const SDLoc &DL, SelectionDAG &DAG,
56285 const X86Subtarget &Subtarget) {
56286 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56287 VT.getVectorElementType() == MVT::i1 &&
56288 (OpVT.getVectorElementType() == MVT::i8 ||
56289 OpVT.getVectorElementType() == MVT::i16)) {
56290 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56291 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56292 }
56293 return SDValue();
56294}
56295
56296// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56297// eq/ne) is generated when using an integer as a mask. Instead of generating a
56298// broadcast + vptest, we can directly move the integer to a mask register.
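// For example (illustrative, i8 %x tested against 8 mask lanes):
//   (setcc (and (vbroadcast %x), <1, 2, 4, 8, 16, 32, 64, 128>), zero, ne)
// selects one bit of %x per lane, so %x can simply be moved into a mask
// register (KMOV), with a NOT of the mask for the seteq form.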
56299static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56300 const SDLoc &DL, SelectionDAG &DAG,
56301 const X86Subtarget &Subtarget) {
56302 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56303 return SDValue();
56304
56305 if (!Subtarget.hasAVX512())
56306 return SDValue();
56307
56308 if (Op0.getOpcode() != ISD::AND)
56309 return SDValue();
56310
56311 SDValue Broadcast = Op0.getOperand(0);
56312 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56313 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56314 return SDValue();
56315
56316 SDValue Load = Op0.getOperand(1);
56317 EVT LoadVT = Load.getSimpleValueType();
56318
56319 APInt UndefElts;
56320 SmallVector<APInt, 32> EltBits;
56321 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56322 UndefElts, EltBits,
56323 /*AllowWholeUndefs*/ true,
56324 /*AllowPartialUndefs*/ false) ||
56325 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56326 return SDValue();
56327
56328 // Check if the constant pool contains only powers of 2 starting from some
56329 // 2^N. The table may also contain undefs because of widening of vector
56330 // operands.
56331 unsigned N = EltBits[0].logBase2();
56332 unsigned Len = UndefElts.getBitWidth();
56333 for (unsigned I = 1; I != Len; ++I) {
56334 if (UndefElts[I]) {
56335 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56336 return SDValue();
56337 break;
56338 }
56339
56340 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56341 return SDValue();
56342 }
56343
56344 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56345 SDValue BroadcastOp;
56346 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56347 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56348 Broadcast, DAG.getVectorIdxConstant(0, DL));
56349 } else {
56350 BroadcastOp = Broadcast.getOperand(0);
56351 if (BroadcastOp.getValueType().isVector())
56352 return SDValue();
56353 }
56354
56355 SDValue Masked = BroadcastOp;
56356 if (N != 0) {
56357 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56358 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56359
56360 if (NumDefinedElts > BroadcastOpBitWidth)
56361 return SDValue();
56362
56363 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56364 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56365 DAG.getConstant(N, DL, BroadcastOpVT));
56366 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56367 DAG.getConstant(Mask, DL, BroadcastOpVT));
56368 }
56369 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56370 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56371 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56372 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56373
56374 if (CC == ISD::SETEQ)
56375 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56376
56377 if (VT != MVT::v16i1)
56378 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56379 DAG.getVectorIdxConstant(0, DL));
56380
56381 return Bitcast;
56382}
56383
56384static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56385 TargetLowering::DAGCombinerInfo &DCI,
56386 const X86Subtarget &Subtarget) {
56387 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56388 const SDValue LHS = N->getOperand(0);
56389 const SDValue RHS = N->getOperand(1);
56390 EVT VT = N->getValueType(0);
56391 EVT OpVT = LHS.getValueType();
56392 SDLoc DL(N);
56393
56394 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56395 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56396 Subtarget))
56397 return V;
56398 }
56399
56400 if (VT == MVT::i1) {
56401 X86::CondCode X86CC;
56402 if (SDValue V =
56403 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56404 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56405 }
56406
56407 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56408 if (OpVT.isScalarInteger()) {
56409 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56410 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56411 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56412 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56413 if (N0.getOperand(0) == N1)
56414 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56415 N0.getOperand(1));
56416 if (N0.getOperand(1) == N1)
56417 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56418 N0.getOperand(0));
56419 }
56420 return SDValue();
56421 };
56422 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56423 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56424 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56425 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56426
56427 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56428 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56429 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56430 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56431 if (N0.getOperand(0) == N1)
56432 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56433 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56434 if (N0.getOperand(1) == N1)
56435 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56436 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56437 }
56438 return SDValue();
56439 };
56440 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56441 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56442 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56443 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56444
56445 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56446 // cmpne(trunc(x),C) --> cmpne(x,C)
56447 // iff x upper bits are zero.
56448 if (LHS.getOpcode() == ISD::TRUNCATE &&
56449 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56450 isa<ConstantSDNode>(RHS)) {
56451 EVT SrcVT = LHS.getOperand(0).getValueType();
56452 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56453 OpVT.getScalarSizeInBits());
56454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56455 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56456 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56457 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56458 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56459 }
56460
56461 // With C as a power of 2 and C != 0 and C != INT_MIN:
56462 // icmp eq Abs(X) C ->
56463 // (icmp eq A, C) | (icmp eq A, -C)
56464 // icmp ne Abs(X) C ->
56465 // (icmp ne A, C) & (icmp ne A, -C)
56466 // Both of these patterns can be better optimized in
56467 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56468 // integers which is checked above.
56469 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56470 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56471 const APInt &CInt = C->getAPIntValue();
56472 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56473 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56474 SDValue BaseOp = LHS.getOperand(0);
56475 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56476 SDValue SETCC1 = DAG.getSetCC(
56477 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56478 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56479 SETCC0, SETCC1);
56480 }
56481 }
56482 }
56483 }
56484 }
56485
56486 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56487 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56488 // Using temporaries to avoid messing up operand ordering for later
56489 // transformations if this doesn't work.
56490 SDValue Op0 = LHS;
56491 SDValue Op1 = RHS;
56492 ISD::CondCode TmpCC = CC;
56493 // Put build_vector on the right.
56494 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56495 std::swap(Op0, Op1);
56496 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56497 }
56498
56499 bool IsSEXT0 =
56500 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56501 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56502 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56503
56504 if (IsSEXT0 && IsVZero1) {
56505 assert(VT == Op0.getOperand(0).getValueType() &&
56506 "Unexpected operand type");
56507 if (TmpCC == ISD::SETGT)
56508 return DAG.getConstant(0, DL, VT);
56509 if (TmpCC == ISD::SETLE)
56510 return DAG.getConstant(1, DL, VT);
56511 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56512 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56513
56514 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56515 "Unexpected condition code!");
56516 return Op0.getOperand(0);
56517 }
56518
56519 if (IsVZero1)
56520 if (SDValue V =
56521 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56522 return V;
56523 }
56524
56525 // Try and make unsigned vector comparisons signed. On pre-AVX512 targets
56526 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56527 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56528 // it's going to a mask, there are signed AVX512 comparisons).
56529 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56530 bool CanMakeSigned = false;
56531 if (ISD::isUnsignedIntSetCC(CC)) {
56532 KnownBits CmpKnown =
56533 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56534 // If we know LHS/RHS share the same sign bit at each element we can
56535 // make this signed.
56536 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56537 // across all lanes. So a pattern where the sign varies from lane to
56538 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56539 // missed. We could get around this by demanding each lane
56540 // independently, but this isn't the most important optimization and
56541 // that may eat into compile time.
56542 CanMakeSigned =
56543 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56544 }
56545 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56546 SDValue LHSOut = LHS;
56547 SDValue RHSOut = RHS;
56548 ISD::CondCode NewCC = CC;
56549 switch (CC) {
56550 case ISD::SETGE:
56551 case ISD::SETUGE:
56552 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56553 /*NSW*/ true))
56554 LHSOut = NewLHS;
56555 else if (SDValue NewRHS = incDecVectorConstant(
56556 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56557 RHSOut = NewRHS;
56558 else
56559 break;
56560
56561 [[fallthrough]];
56562 case ISD::SETUGT:
56563 NewCC = ISD::SETGT;
56564 break;
56565
56566 case ISD::SETLE:
56567 case ISD::SETULE:
56568 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56569 /*NSW*/ true))
56570 LHSOut = NewLHS;
56571 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56572 /*NSW*/ true))
56573 RHSOut = NewRHS;
56574 else
56575 break;
56576
56577 [[fallthrough]];
56578 case ISD::SETULT:
56579 // Will be swapped to SETGT in LowerVSETCC*.
56580 NewCC = ISD::SETLT;
56581 break;
56582 default:
56583 break;
56584 }
56585 if (NewCC != CC) {
56586 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56587 NewCC, DL, DAG, Subtarget))
56588 return R;
56589 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56590 }
56591 }
56592 }
56593
56594 if (SDValue R =
56595 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56596 return R;
56597
56598 // The middle end transforms:
56599 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56600 // -> `(icmp ult (add x, -C), 2)`
56601 // Likewise the inverted cases with `ugt`.
56602 //
56603 // Since x86, pre-AVX512, doesn't have unsigned vector compares, this results
56604 // in worse codegen. So, undo the middle-end transform and go back to the
56605 // `(or (icmp eq), (icmp eq))` form.
56606 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56607 // the xmm approach.
56608 //
56609 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56610 // ne))` as it doesn't end up saving instructions.
56611 // TODO: We might want to do this for avx512 as well if we `sext` the result.
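 // For example (illustrative, with C == 5 in every lane):
 //   (setcc ult (add %x, splat(-5)), splat(2))
 // is rebuilt as
 //   (or (setcc eq %x, splat(5)), (setcc eq %x, splat(6)))
 // which maps onto two PCMPEQs plus a POR instead of emulating the unsigned
 // compare.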
56612 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56613 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56614 !Subtarget.hasAVX512() &&
56615 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56616 Subtarget.hasAVX2()) &&
56617 LHS.hasOneUse()) {
56618
56619 APInt CmpC;
56620 SDValue AddC = LHS.getOperand(1);
56621 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56622 DAG.isConstantIntBuildVectorOrConstantInt(AddC)) {
56623 // See which form we have depending on the constant/condition.
56624 SDValue C0 = SDValue();
56625 SDValue C1 = SDValue();
56626
56627 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56628 // we will end up generating an additional constant. Keeping in the
56629 // current form has a slight latency cost, but it is probably worth saving a
56630 // constant.
56631 if (ISD::isConstantSplatVectorAllOnes(AddC.getNode()) &&
56632 DAG.getTargetLoweringInfo().isOperationLegal(ISD::UMIN, OpVT)) {
56633 // Pass
56634 }
56635 // Normal Cases
56636 else if ((CC == ISD::SETULT && CmpC == 2) ||
56637 (CC == ISD::SETULE && CmpC == 1)) {
56638 // These will constant fold.
56639 C0 = DAG.getNegative(AddC, DL, OpVT);
56640 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56641 DAG.getAllOnesConstant(DL, OpVT));
56642 }
56643 // Inverted Cases
56644 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56645 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56646 // These will constant fold.
56647 C0 = DAG.getNOT(DL, AddC, OpVT);
56648 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56649 DAG.getAllOnesConstant(DL, OpVT));
56650 }
56651 if (C0 && C1) {
56652 SDValue NewLHS =
56653 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56654 SDValue NewRHS =
56655 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56656 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56657 }
56658 }
56659 }
56660
56661 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56662 // to avoid scalarization via legalization because v4i32 is not a legal type.
56663 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56664 LHS.getValueType() == MVT::v4f32)
56665 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56666
56667 // X pred 0.0 --> X pred -X
56668 // If the negation of X already exists, use it in the comparison. This removes
56669 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56670 // instructions in patterns with a 'select' node.
56672 SDVTList FNegVT = DAG.getVTList(OpVT);
56673 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56674 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56675 }
56676
56677 return SDValue();
56678}
56679
56682 const X86Subtarget &Subtarget) {
56683 SDValue Src = N->getOperand(0);
56684 MVT SrcVT = Src.getSimpleValueType();
56685 MVT VT = N->getSimpleValueType(0);
56686 unsigned NumBits = VT.getScalarSizeInBits();
56687 unsigned NumElts = SrcVT.getVectorNumElements();
56688 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56689 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56690
56691 // Perform constant folding.
56692 APInt UndefElts;
56693 SmallVector<APInt, 32> EltBits;
56694 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56695 /*AllowWholeUndefs*/ true,
56696 /*AllowPartialUndefs*/ true)) {
56697 APInt Imm(32, 0);
56698 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56699 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56700 Imm.setBit(Idx);
56701
56702 return DAG.getConstant(Imm, SDLoc(N), VT);
56703 }
56704
56705 // Look through int->fp bitcasts that don't change the element width.
56706 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56707 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56708 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56709 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56710
56711 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56712 // with scalar comparisons.
56713 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56714 SDLoc DL(N);
56715 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56716 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56717 return DAG.getNode(ISD::XOR, DL, VT,
56718 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56719 DAG.getConstant(NotMask, DL, VT));
56720 }
56721
56722 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56723 // results with scalar comparisons.
56724 if (Src.getOpcode() == X86ISD::PCMPGT &&
56725 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56726 SDLoc DL(N);
56727 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56728 return DAG.getNode(ISD::XOR, DL, VT,
56729 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56730 DAG.getConstant(NotMask, DL, VT));
56731 }
56732
56733 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56734 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56735 // iff pow2splat(c1).
56736 // Use KnownBits to determine if only a single bit is non-zero
56737 // in each element (pow2 or zero), and shift that bit to the msb.
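// For example (illustrative): with c1 == splat(0x10) in v4i32, the candidate
// bit is shifted into the sign bit of each lane (shift amount 27), the two
// shifted values are XORed and inverted, and a single MOVMSK of the result
// reproduces the per-lane equality mask.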
56738 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56739 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56740 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56741 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56742 if (KnownLHS.countMaxPopulation() == 1 &&
56743 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56744 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56745 SDLoc DL(N);
56746 MVT ShiftVT = SrcVT;
56747 SDValue ShiftLHS = Src.getOperand(0);
56748 SDValue ShiftRHS = Src.getOperand(1);
56749 if (ShiftVT.getScalarType() == MVT::i8) {
56750 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
56751 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56752 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56753 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56754 }
56755 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56756 ShiftLHS, ShiftAmt, DAG);
56757 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56758 ShiftRHS, ShiftAmt, DAG);
56759 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56760 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56761 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56762 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56763 }
56764 }
56765
56766 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56767 if (N->isOnlyUserOf(Src.getNode())) {
56769 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56770 APInt UndefElts;
56771 SmallVector<APInt, 32> EltBits;
56772 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56773 UndefElts, EltBits)) {
56774 APInt Mask = APInt::getZero(NumBits);
56775 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56776 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56777 Mask.setBit(Idx);
56778 }
56779 SDLoc DL(N);
56780 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56781 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56782 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56783 DAG.getConstant(Mask, DL, VT));
56784 }
56785 }
56786 }
56787
56788 // Simplify the inputs.
56789 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56790 APInt DemandedMask(APInt::getAllOnes(NumBits));
56791 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56792 return SDValue(N, 0);
56793
56794 return SDValue();
56795}
56796
56799 const X86Subtarget &Subtarget) {
56800 MVT VT = N->getSimpleValueType(0);
56801 unsigned NumBits = VT.getScalarSizeInBits();
56802
56803 // Simplify the inputs.
56804 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56805 APInt DemandedMask(APInt::getAllOnes(NumBits));
56806 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56807 return SDValue(N, 0);
56808
56809 return SDValue();
56810}
56811
56815 SDValue Mask = MemOp->getMask();
56816
56817 // With vector masks we only demand the upper bit of the mask.
56818 if (Mask.getScalarValueSizeInBits() != 1) {
56819 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56820 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56821 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56822 if (N->getOpcode() != ISD::DELETED_NODE)
56823 DCI.AddToWorklist(N);
56824 return SDValue(N, 0);
56825 }
56826 }
56827
56828 return SDValue();
56829}
56830
56832 SDValue Index, SDValue Base, SDValue Scale,
56833 SelectionDAG &DAG) {
56834 SDLoc DL(GorS);
56835
56836 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56837 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56838 Gather->getMask(), Base, Index, Scale } ;
56839 return DAG.getMaskedGather(Gather->getVTList(),
56840 Gather->getMemoryVT(), DL, Ops,
56841 Gather->getMemOperand(),
56842 Gather->getIndexType(),
56843 Gather->getExtensionType());
56844 }
56845 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56846 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56847 Scatter->getMask(), Base, Index, Scale };
56848 return DAG.getMaskedScatter(Scatter->getVTList(),
56849 Scatter->getMemoryVT(), DL,
56850 Ops, Scatter->getMemOperand(),
56851 Scatter->getIndexType(),
56852 Scatter->isTruncatingStore());
56853}
56854
56857 SDLoc DL(N);
56858 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56859 SDValue Index = GorS->getIndex();
56860 SDValue Base = GorS->getBasePtr();
56861 SDValue Scale = GorS->getScale();
56862 EVT IndexVT = Index.getValueType();
56863 EVT IndexSVT = IndexVT.getVectorElementType();
56864 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56865 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56866 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56867
56868 if (DCI.isBeforeLegalize()) {
56869 // Attempt to move a shifted index into the address scale, which allows
56870 // further index truncation below.
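// For example (illustrative): an index of (shl x, 2) with scale 2 can be
// rewritten as (shl x, 1) with scale 4; the combined multiplier (8) stays the
// same while one bit of the shift is absorbed into the address scale.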
56871 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56872 isa<ConstantSDNode>(Scale)) {
56873 unsigned ScaleAmt = Scale->getAsZExtVal();
56874 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56875 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56876 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56877 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56878 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56879 if (N->getOpcode() != ISD::DELETED_NODE)
56880 DCI.AddToWorklist(N);
56881 return SDValue(N, 0);
56882 }
56883 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56884 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56885 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56886 SDValue ShAmt = Index.getOperand(1);
56887 SDValue NewShAmt =
56888 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56889 DAG.getConstant(1, DL, ShAmt.getValueType()));
56890 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56891 Index.getOperand(0), NewShAmt);
56892 SDValue NewScale =
56893 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56894 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56895 }
56896 }
56897 }
56898
56899 // Shrink indices if they are larger than 32 bits.
56900 // Only do this before legalize types since v2i64 could become v2i32.
56901 // FIXME: We could check that the type is legal if we're after legalize
56902 // types, but then we would need to construct test cases where that happens.
56903 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56904 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56905
56906 // FIXME: We could support more than just constant fold, but we need to be
56907 // careful with costing. A truncate that can be optimized out would be
56908 // fine. Otherwise we might only want to create a truncate if it avoids
56909 // a split.
56910 if (SDValue TruncIndex =
56911 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56912 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56913
56914 // Shrink any sign/zero extend from a type of 32 bits or smaller to a type
56915 // larger than 32 bits if there are sufficient sign bits. Only do this
56916 // before legalize types to avoid creating illegal types in truncate.
56917 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56918 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56919 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56920 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56921 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56922 }
56923
56924 // Shrink if we remove an illegal type.
56925 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56926 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56927 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56928 }
56929 }
56930 }
56931
56932 // Try to move splat adders from the index operand to the base pointer
56933 // operand, taking care to multiply by the scale. We can only do this when
56934 // the index element type is the same as the pointer type. Otherwise we
56935 // need to be sure the math doesn't wrap before the scale is applied.
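// For example (illustrative): a gather with base B, scale 4 and index
// (add idx, splat(3)) becomes base B+12 with index idx, folding the splatted
// offset into the displacement.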
56936 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56937 isa<ConstantSDNode>(Scale)) {
56938 uint64_t ScaleAmt = Scale->getAsZExtVal();
56939
56940 for (unsigned I = 0; I != 2; ++I)
56941 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56942 BitVector UndefElts;
56943 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56944 if (UndefElts.none()) {
56945 // If the splat value is constant we can add the scaled splat value
56946 // to the existing base.
56947 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56948 APInt Adder = C->getAPIntValue() * ScaleAmt;
56949 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56950 DAG.getConstant(Adder, DL, PtrVT));
56951 SDValue NewIndex = Index.getOperand(1 - I);
56952 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56953 }
56954 // For non-constant cases, limit this to non-scaled cases.
56955 if (ScaleAmt == 1) {
56956 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56957 SDValue NewIndex = Index.getOperand(1 - I);
56958 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56959 }
56960 }
56961 }
56962 // It's also possible the base is just a constant. In that case, just
56963 // replace it with 0 and move the displacement into the index.
56964 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56965 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56966 // Combine the constant build_vector and the constant base.
56967 Splat =
56968 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56969 // Add to the other half of the original Index add.
56970 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56971 Index.getOperand(1 - I), Splat);
56972 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56973 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56974 }
56975 }
56976 }
56977
56978 if (DCI.isBeforeLegalizeOps()) {
56979 // Make sure the index is either i32 or i64
56980 if (IndexWidth != 32 && IndexWidth != 64) {
56981 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56982 IndexVT = IndexVT.changeVectorElementType(EltVT);
56983 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56984 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56985 }
56986 }
56987
56988 // With vector masks we only demand the upper bit of the mask.
56989 SDValue Mask = GorS->getMask();
56990 if (Mask.getScalarValueSizeInBits() != 1) {
56991 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56992 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56993 if (N->getOpcode() != ISD::DELETED_NODE)
56994 DCI.AddToWorklist(N);
56995 return SDValue(N, 0);
56996 }
56997 }
56998
56999 return SDValue();
57000}
57001
57002// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57004 const X86Subtarget &Subtarget) {
57005 SDLoc DL(N);
57006 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57007 SDValue EFLAGS = N->getOperand(1);
57008
57009 // Try to simplify the EFLAGS and condition code operands.
57010 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57011 return getSETCC(CC, Flags, DL, DAG);
57012
57013 return SDValue();
57014}
57015
57016/// Optimize branch condition evaluation.
57018 const X86Subtarget &Subtarget) {
57019 SDLoc DL(N);
57020 SDValue EFLAGS = N->getOperand(3);
57021 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57022
57023 // Try to simplify the EFLAGS and condition code operands.
57024 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57025 // RAUW them under us.
57026 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57027 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57028 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57029 N->getOperand(1), Cond, Flags);
57030 }
57031
57032 return SDValue();
57033}
57034
57035// TODO: Could we move this to DAGCombine?
57037 SelectionDAG &DAG) {
57038 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57039 // to optimize away operation when it's from a constant.
57040 //
57041 // The general transformation is:
57042 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57043 // AND(VECTOR_CMP(x,y), constant2)
57044 // constant2 = UNARYOP(constant)
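// For example (illustrative, a v4i32 compare feeding a v4f32 sitofp):
//   sitofp (and (setcc x, y), splat(1))
//   --> and (setcc x, y), bitcast(splat(1.0f))
// Each compare lane is all-ones or all-zeros, so ANDing with the converted
// constant gives the same per-lane result as converting after the AND.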
57045
57046 // Early exit if this isn't a vector operation, the operand of the
57047 // unary operation isn't a bitwise AND, or if the sizes of the operations
57048 // aren't the same.
57049 EVT VT = N->getValueType(0);
57050 bool IsStrict = N->isStrictFPOpcode();
57051 unsigned NumEltBits = VT.getScalarSizeInBits();
57052 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57053 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57054 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57055 VT.getSizeInBits() != Op0.getValueSizeInBits())
57056 return SDValue();
57057
57058 // Now check that the other operand of the AND is a constant. We could
57059 // make the transformation for non-constant splats as well, but it's unclear
57060 // that would be a benefit as it would not eliminate any operations, just
57061 // perform one more step in scalar code before moving to the vector unit.
57062 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57063 // Bail out if the vector isn't a constant.
57064 if (!BV->isConstant())
57065 return SDValue();
57066
57067 // Everything checks out. Build up the new and improved node.
57068 SDLoc DL(N);
57069 EVT IntVT = BV->getValueType(0);
57070 // Create a new constant of the appropriate type for the transformed
57071 // DAG.
57072 SDValue SourceConst;
57073 if (IsStrict)
57074 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57075 {N->getOperand(0), SDValue(BV, 0)});
57076 else
57077 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57078 // The AND node needs bitcasts to/from an integer vector type around it.
57079 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57080 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57081 MaskConst);
57082 SDValue Res = DAG.getBitcast(VT, NewAnd);
57083 if (IsStrict)
57084 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57085 return Res;
57086 }
57087
57088 return SDValue();
57089}
57090
57091/// If we are converting a value to floating-point, try to replace scalar
57092/// truncate of an extracted vector element with a bitcast. This tries to keep
57093/// the sequence on XMM registers rather than moving between vector and GPRs.
57095 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57096 // to allow being called by any similar cast opcode.
57097 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57098 SDValue Trunc = N->getOperand(0);
57099 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57100 return SDValue();
57101
57102 SDValue ExtElt = Trunc.getOperand(0);
57103 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57104 !isNullConstant(ExtElt.getOperand(1)))
57105 return SDValue();
57106
57107 EVT TruncVT = Trunc.getValueType();
57108 EVT SrcVT = ExtElt.getValueType();
57109 unsigned DestWidth = TruncVT.getSizeInBits();
57110 unsigned SrcWidth = SrcVT.getSizeInBits();
57111 if (SrcWidth % DestWidth != 0)
57112 return SDValue();
57113
57114 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
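// For example (illustrative): with X : v2i64 and an i32 truncate, X is
// bitcast to v4i32 and element 0 is extracted directly, keeping the value in
// an XMM register instead of bouncing through a GPR.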
57115 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57116 unsigned VecWidth = SrcVecVT.getSizeInBits();
57117 unsigned NumElts = VecWidth / DestWidth;
57118 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57119 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57120 SDLoc DL(N);
57121 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57122 BitcastVec, ExtElt.getOperand(1));
57123 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57124}
57125
57127 const X86Subtarget &Subtarget) {
57128 bool IsStrict = N->isStrictFPOpcode();
57129 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57130 EVT VT = N->getValueType(0);
57131 EVT InVT = Op0.getValueType();
57132
57133 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57134 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57135 // if hasFP16 support:
57136 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57137 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57138 // else
57139 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57140 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57141 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57142 unsigned ScalarSize = InVT.getScalarSizeInBits();
57143 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57144 ScalarSize >= 64)
57145 return SDValue();
57146 SDLoc dl(N);
57147 EVT DstVT =
57149 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57150 : ScalarSize < 32 ? MVT::i32
57151 : MVT::i64,
57152 InVT.getVectorNumElements());
57153 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57154 if (IsStrict)
57155 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57156 {N->getOperand(0), P});
57157 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57158 }
57159
57160 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57161 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57162 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
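// For example (illustrative): uint_to_fp v8i8 -> v8f32 becomes
// sint_to_fp (zext v8i8 to v8i32), since the zero-extended value is never
// negative and the signed conversion is legal where the unsigned one is not.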
57163 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57164 VT.getScalarType() != MVT::f16) {
57165 SDLoc dl(N);
57166 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57167 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57168
57169 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57170 if (IsStrict)
57171 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57172 {N->getOperand(0), P});
57173 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57174 }
57175
57176 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
57177 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57178 // the optimization here.
57179 SDNodeFlags Flags = N->getFlags();
57180 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57181 if (IsStrict)
57182 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57183 {N->getOperand(0), Op0});
57184 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57185 }
57186
57187 return SDValue();
57188}
57189
57192 const X86Subtarget &Subtarget) {
57193 // First try to optimize away the conversion entirely when it's
57194 // conditionally from a constant. Vectors only.
57195 bool IsStrict = N->isStrictFPOpcode();
57197 return Res;
57198
57199 // Now move on to more general possibilities.
57200 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57201 EVT VT = N->getValueType(0);
57202 EVT InVT = Op0.getValueType();
57203
57204 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57205 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57206 // if hasFP16 support:
57207 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57208 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57209 // else
57210 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57211 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57212 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57213 unsigned ScalarSize = InVT.getScalarSizeInBits();
57214 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57215 ScalarSize >= 64)
57216 return SDValue();
57217 SDLoc dl(N);
57218 EVT DstVT =
57220 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57221 : ScalarSize < 32 ? MVT::i32
57222 : MVT::i64,
57223 InVT.getVectorNumElements());
57224 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57225 if (IsStrict)
57226 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57227 {N->getOperand(0), P});
57228 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57229 }
57230
57231 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57232 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57233 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57234 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57235 VT.getScalarType() != MVT::f16) {
57236 SDLoc dl(N);
57237 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57238 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57239 if (IsStrict)
57240 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57241 {N->getOperand(0), P});
57242 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57243 }
57244
57245 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57246 // vectors and scalars, see if we know that the upper bits are all the sign
57247 // bit, in which case we can truncate the input to i32 and convert from that.
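// For example (illustrative): a v2i64 -> v2f64 sint_to_fp whose inputs have
// at least 33 sign bits can instead convert from a truncated v2i32, which has
// direct hardware support without AVX512DQ.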
57248 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57249 unsigned BitWidth = InVT.getScalarSizeInBits();
57250 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57251 if (NumSignBits >= (BitWidth - 31)) {
57252 EVT TruncVT = MVT::i32;
57253 if (InVT.isVector())
57254 TruncVT = InVT.changeVectorElementType(TruncVT);
57255 SDLoc dl(N);
57256 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57257 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57258 if (IsStrict)
57259 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57260 {N->getOperand(0), Trunc});
57261 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57262 }
57263 // If we're after legalize and the type is v2i32 we need to shuffle and
57264 // use CVTSI2P.
57265 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57266 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57267 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57268 { 0, 2, -1, -1 });
57269 if (IsStrict)
57270 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57271 {N->getOperand(0), Shuf});
57272 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57273 }
57274 }
57275
57276 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57277 // a 32-bit target where SSE doesn't support i64->FP operations.
57278 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57279 Op0.getOpcode() == ISD::LOAD) {
57280 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57281
57282 // This transformation is not supported if the result type is f16 or f128.
57283 if (VT == MVT::f16 || VT == MVT::f128)
57284 return SDValue();
57285
57286 // If we have AVX512DQ we can use packed conversion instructions unless
57287 // the VT is f80.
57288 if (Subtarget.hasDQI() && VT != MVT::f80)
57289 return SDValue();
57290
57291 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57292 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57293 std::pair<SDValue, SDValue> Tmp =
57294 Subtarget.getTargetLowering()->BuildFILD(
57295 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57296 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57297 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57298 return Tmp.first;
57299 }
57300 }
57301
57302 if (IsStrict)
57303 return SDValue();
57304
57305 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57306 return V;
57307
57308 return SDValue();
57309}
57310
57312 const X86Subtarget &Subtarget) {
57313 EVT VT = N->getValueType(0);
57314 SDValue Src = N->getOperand(0);
57315 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57316 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57317 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57318
57319 return SDValue();
57320}
57321
57322// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57324 const X86Subtarget &Subtarget) {
57325 if (!Subtarget.hasAVX10_2())
57326 return SDValue();
57327
57328 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57329 EVT SrcVT = N->getOperand(0).getValueType();
57330 EVT DstVT = N->getValueType(0);
57331 SDLoc dl(N);
57332
57333 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57334 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57335
57336 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57337 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57338 N->getOperand(0), V2F32Value);
57339
57340 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57341 if (IsSigned)
57342 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57343
57344 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57345 }
57346 return SDValue();
57347}
57348
57350 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57351
57352 for (const SDNode *User : Flags->users()) {
57353 X86::CondCode CC;
57354 switch (User->getOpcode()) {
57355 default:
57356 // Be conservative.
57357 return true;
57358 case X86ISD::SETCC:
57360 CC = (X86::CondCode)User->getConstantOperandVal(0);
57361 break;
57362 case X86ISD::BRCOND:
57363 case X86ISD::CMOV:
57364 CC = (X86::CondCode)User->getConstantOperandVal(2);
57365 break;
57366 }
57367
57368 switch (CC) {
57369 // clang-format off
57370 default: break;
57371 case X86::COND_A: case X86::COND_AE:
57372 case X86::COND_B: case X86::COND_BE:
57373 case X86::COND_O: case X86::COND_NO:
57374 case X86::COND_G: case X86::COND_GE:
57375 case X86::COND_L: case X86::COND_LE:
57376 return true;
57377 // clang-format on
57378 }
57379 }
57380
57381 return false;
57382}
57383
57384static bool onlyZeroFlagUsed(SDValue Flags) {
57385 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57386
57387 for (const SDNode *User : Flags->users()) {
57388 unsigned CCOpNo;
57389 switch (User->getOpcode()) {
57390 default:
57391 // Be conservative.
57392 return false;
57393 case X86ISD::SETCC:
57395 CCOpNo = 0;
57396 break;
57397 case X86ISD::BRCOND:
57398 case X86ISD::CMOV:
57399 CCOpNo = 2;
57400 break;
57401 }
57402
57403 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57404 if (CC != X86::COND_E && CC != X86::COND_NE)
57405 return false;
57406 }
57407
57408 return true;
57409}
57410
57413 const X86Subtarget &Subtarget) {
57414 // Only handle test patterns.
57415 if (!isNullConstant(N->getOperand(1)))
57416 return SDValue();
57417
57418 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57419 // and use its flags directly.
57420 // TODO: Maybe we should try promoting compares that only use the zero flag
57421 // first if we can prove the upper bits with computeKnownBits?
57422 SDLoc dl(N);
57423 SDValue Op = N->getOperand(0);
57424 EVT VT = Op.getValueType();
57425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57426
57427 if (SDValue CMP =
57428 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57429 return CMP;
57430
57431 // If we have a constant logical shift that's only used in a comparison
57432 // against zero, turn it into an equivalent AND. This allows turning it into
57433 // a TEST instruction later.
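// For example (illustrative): (cmp (srl x, 4), 0) on i32 only needs ZF and is
// equivalent to (test x, 0xFFFFFFF0), i.e. an AND against the bits that
// survive the shift.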
57434 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57435 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57436 onlyZeroFlagUsed(SDValue(N, 0))) {
57437 unsigned BitWidth = VT.getSizeInBits();
57438 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57439 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57440 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57441 APInt Mask = Op.getOpcode() == ISD::SRL
57442 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57443 : APInt::getLowBitsSet(BitWidth, MaskBits);
57444 if (Mask.isSignedIntN(32)) {
57445 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57446 DAG.getConstant(Mask, dl, VT));
57447 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57448 DAG.getConstant(0, dl, VT));
57449 }
57450 }
57451 }
57452
57453 // If we're extracting from an avx512 bool vector and comparing against zero,
57454 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57455 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57456 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57457 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57458 SDValue Src = Op.getOperand(0);
57459 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57460 isNullConstant(Src.getOperand(1)) &&
57461 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57462 SDValue BoolVec = Src.getOperand(0);
57463 unsigned ShAmt = 0;
57464 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57465 ShAmt = BoolVec.getConstantOperandVal(1);
57466 BoolVec = BoolVec.getOperand(0);
57467 }
57468 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57469 EVT VecVT = BoolVec.getValueType();
57470 unsigned BitWidth = VecVT.getVectorNumElements();
57471 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57472 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57473 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57474 Op = DAG.getBitcast(BCVT, BoolVec);
57475 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57476 DAG.getConstant(Mask, dl, BCVT));
57477 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57478 DAG.getConstant(0, dl, BCVT));
57479 }
57480 }
57481 }
57482
57483 // Peek through any zero-extend if we're only testing for a zero result.
57484 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57485 SDValue Src = Op.getOperand(0);
57486 EVT SrcVT = Src.getValueType();
57487 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57488 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57489 DAG.getConstant(0, dl, SrcVT));
57490 }
57491
57492 // Look for a truncate.
57493 if (Op.getOpcode() != ISD::TRUNCATE)
57494 return SDValue();
57495
57496 SDValue Trunc = Op;
57497 Op = Op.getOperand(0);
57498
57499 // See if we can compare with zero against the truncation source,
57500 // which should help using the Z flag from many ops. Only do this for
57501 // i32 truncated op to prevent partial-reg compares of promoted ops.
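// For example (illustrative): (cmp (trunc i32 x to i8), 0) where the upper 24
// bits of x are known zero can compare x against zero directly, avoiding an
// 8-bit partial-register compare.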
57502 EVT OpVT = Op.getValueType();
57503 APInt UpperBits =
57505 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57506 onlyZeroFlagUsed(SDValue(N, 0))) {
57507 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57508 DAG.getConstant(0, dl, OpVT));
57509 }
57510
57511 // After this the truncate and arithmetic op must have a single use.
57512 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57513 return SDValue();
57514
57515 unsigned NewOpc;
57516 switch (Op.getOpcode()) {
57517 default: return SDValue();
57518 case ISD::AND:
57519 // Skip AND with a constant. We have special handling for AND with an
57520 // immediate during isel to generate TEST instructions.
57521 if (isa<ConstantSDNode>(Op.getOperand(1)))
57522 return SDValue();
57523 NewOpc = X86ISD::AND;
57524 break;
57525 case ISD::OR: NewOpc = X86ISD::OR; break;
57526 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57527 case ISD::ADD:
57528 // If the carry or overflow flag is used, we can't truncate.
57530 return SDValue();
57531 NewOpc = X86ISD::ADD;
57532 break;
57533 case ISD::SUB:
57534 // If the carry or overflow flag is used, we can't truncate.
57536 return SDValue();
57537 NewOpc = X86ISD::SUB;
57538 break;
57539 }
57540
57541 // We found an op we can narrow. Truncate its inputs.
57542 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57543 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57544
57545 // Use an X86-specific opcode to avoid DAG combine messing with it.
57546 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57547 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57548
57549 // For AND, keep a CMP so that we can match the test pattern.
57550 if (NewOpc == X86ISD::AND)
57551 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57552 DAG.getConstant(0, dl, VT));
57553
57554 // Return the flags.
57555 return Op.getValue(1);
57556}
57557
57560 const X86Subtarget &ST) {
57561 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57562 "Expected X86ISD::ADD or X86ISD::SUB");
57563
57564 SDLoc DL(N);
57565 SDValue LHS = N->getOperand(0);
57566 SDValue RHS = N->getOperand(1);
57567 MVT VT = LHS.getSimpleValueType();
57568 bool IsSub = X86ISD::SUB == N->getOpcode();
57569 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57570
57571 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57572 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57573 return CMP;
57574
57575 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57576 if (!N->hasAnyUseOfValue(1)) {
57577 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57578 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57579 }
57580
57581 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57582 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57583 SDValue Ops[] = {N0, N1};
57584 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57585 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57586 SDValue Op(N, 0);
57587 if (Negate) {
57588 // Bail if this is only used by a user of the x86 add/sub.
57589 if (GenericAddSub->hasOneUse() &&
57590 GenericAddSub->user_begin()->isOnlyUserOf(N))
57591 return;
57592 Op = DAG.getNegative(Op, DL, VT);
57593 }
57594 DCI.CombineTo(GenericAddSub, Op);
57595 }
57596 };
57597 MatchGeneric(LHS, RHS, false);
57598 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57599
57600 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57601 // EFLAGS result doesn't change.
57602 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57603 /*ZeroSecondOpOnly*/ true);
57604}
57605
57607 SDValue LHS = N->getOperand(0);
57608 SDValue RHS = N->getOperand(1);
57609 SDValue BorrowIn = N->getOperand(2);
57610
57611 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57612 MVT VT = N->getSimpleValueType(0);
57613 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57614 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57615 }
57616
57617 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57618 // iff the flag result is dead.
57619 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57620 !N->hasAnyUseOfValue(1))
57621 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57622 LHS.getOperand(1), BorrowIn);
57623
57624 return SDValue();
57625}
57626
57627// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57630 SDValue LHS = N->getOperand(0);
57631 SDValue RHS = N->getOperand(1);
57632 SDValue CarryIn = N->getOperand(2);
57633 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57634 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57635
57636 // Canonicalize constant to RHS.
57637 if (LHSC && !RHSC)
57638 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57639 CarryIn);
57640
57641 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57642 // the result is either zero or one (depending on the input carry bit).
57643 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57644 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57645 // We don't have a good way to replace an EFLAGS use, so only do this when
57646 // dead right now.
57647 SDValue(N, 1).use_empty()) {
57648 SDLoc DL(N);
57649 EVT VT = N->getValueType(0);
57650 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57651 SDValue Res1 = DAG.getNode(
57652 ISD::AND, DL, VT,
57654 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57655 DAG.getConstant(1, DL, VT));
57656 return DCI.CombineTo(N, Res1, CarryOut);
57657 }
57658
57659 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57660 // iff the flag result is dead.
57661 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57662 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57663 SDLoc DL(N);
57664 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57665 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57666 DAG.getConstant(0, DL, LHS.getValueType()),
57667 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57668 }
57669
57670 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57671 MVT VT = N->getSimpleValueType(0);
57672 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57673 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57674 }
57675
57676 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57677 // iff the flag result is dead.
57678 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57679 !N->hasAnyUseOfValue(1))
57680 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57681 LHS.getOperand(1), CarryIn);
57682
57683 return SDValue();
57684}
57685
57687 const SDLoc &DL, EVT VT,
57688 const X86Subtarget &Subtarget) {
57689 using namespace SDPatternMatch;
57690
57691 // Example of pattern we try to detect:
57692 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57693 //(add (build_vector (extract_elt t, 0),
57694 // (extract_elt t, 2),
57695 // (extract_elt t, 4),
57696 // (extract_elt t, 6)),
57697 // (build_vector (extract_elt t, 1),
57698 // (extract_elt t, 3),
57699 // (extract_elt t, 5),
57700 // (extract_elt t, 7)))
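// When this matches, the multiply operands are shrunk back to vXi16 and the
// interleaved sum is emitted with VPMADDWD (plus an ordinary add of the
// accumulator, if one was matched).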
57701
57702 if (!Subtarget.hasSSE2())
57703 return SDValue();
57704
57705 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57706 VT.getVectorNumElements() < 4 ||
57708 return SDValue();
57709
57710 SDValue Op0, Op1, Accum;
57715 m_Value(Op1))))))
57716 return SDValue();
57717
57718 // Check if one of Op0,Op1 is of the form:
57719 // (build_vector (extract_elt Mul, 0),
57720 // (extract_elt Mul, 2),
57721 // (extract_elt Mul, 4),
57722 // ...
57723 // the other is of the form:
57724 // (build_vector (extract_elt Mul, 1),
57725 // (extract_elt Mul, 3),
57726 // (extract_elt Mul, 5),
57727 // ...
57728 // and identify Mul.
57729 SDValue Mul;
57730 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57731 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57732 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57733 // TODO: Be more tolerant to undefs.
57734 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57735 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57736 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57737 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57738 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57739 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57740 return SDValue();
57741 // Commutativity of mul allows factors of a product to reorder.
57742 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57743 std::swap(Idx0L, Idx1L);
57744 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57745 std::swap(Idx0H, Idx1H);
57746 // Commutativity of add allows pairs of factors to reorder.
57747 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57748 std::swap(Idx0L, Idx0H);
57749 std::swap(Idx1L, Idx1H);
57750 }
57751 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57752 Idx1H != 2 * i + 3)
57753 return SDValue();
57754 if (!Mul) {
57755 // First time an extract_elt's source vector is visited. Must be a MUL
57756 // with 2X the number of vector elements of the BUILD_VECTOR.
57757 // Both extracts must be from the same MUL.
57758 Mul = Vec0L;
57759 if (Mul.getOpcode() != ISD::MUL ||
57760 Mul.getValueType().getVectorNumElements() != 2 * e)
57761 return SDValue();
57762 }
57763 // Check that the extract is from the same MUL previously seen.
57764 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57765 return SDValue();
57766 }
57767
57768 // Check if the Mul source can be safely shrunk.
57770 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57772 return SDValue();
57773
57774 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57775 VT.getVectorNumElements() * 2);
57776 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57777 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57778
57779 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57781 EVT InVT = Ops[0].getValueType();
57782 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57783 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57784 InVT.getVectorNumElements() / 2);
57785 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57786 };
57787 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57788 if (Accum)
57789 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57790 return R;
57791}
57792
57793// Attempt to turn this pattern into PMADDWD.
57794// (add (mul (sext (build_vector)), (sext (build_vector))),
57795// (mul (sext (build_vector)), (sext (build_vector)))
57797 const SDLoc &DL, EVT VT,
57798 const X86Subtarget &Subtarget) {
57799 using namespace SDPatternMatch;
57800
57801 if (!Subtarget.hasSSE2())
57802 return SDValue();
57803
57804 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57805 VT.getVectorNumElements() < 4 ||
57807 return SDValue();
57808
57809 // All inputs need to be sign extends.
57810 // TODO: Support ZERO_EXTEND from known positive?
57811 SDValue N00, N01, N10, N11;
57812 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57813 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57814 return SDValue();
57815
57816 // Must be extending from vXi16.
57817 EVT InVT = N00.getValueType();
57818 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57819 N10.getValueType() != InVT || N11.getValueType() != InVT)
57820 return SDValue();
57821
57822 // All inputs should be build_vectors.
57823 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57824 N01.getOpcode() != ISD::BUILD_VECTOR ||
57825 N10.getOpcode() != ISD::BUILD_VECTOR ||
57827 return SDValue();
57828
57829 // For each output element, we need an odd element from one vector
57830 // multiplied by the odd element of the other vector, added to the even
57831 // element from one vector multiplied by the even element of the other
57832 // vector. So for each element i, we need to make sure this operation
57833 // is being performed:
57834 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57835 SDValue In0, In1;
57836 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57837 SDValue N00Elt = N00.getOperand(i);
57838 SDValue N01Elt = N01.getOperand(i);
57839 SDValue N10Elt = N10.getOperand(i);
57840 SDValue N11Elt = N11.getOperand(i);
57841 // TODO: Be more tolerant to undefs.
57842 SDValue N00In, N01In, N10In, N11In;
57843 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57844 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57845 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57846 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57847 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57848 return SDValue();
57849 // Add is commutative so indices can be reordered.
57850 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57851 std::swap(IdxN00, IdxN10);
57852 std::swap(IdxN01, IdxN11);
57853 }
57854 // N0 indices must be the even element. N1 indices must be the next odd element.
57855 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57856 IdxN11 != 2 * i + 1)
57857 return SDValue();
57858
57859 // First time we find an input capture it.
57860 if (!In0) {
57861 In0 = N00In;
57862 In1 = N01In;
57863
57864 // The input vectors must be at least as wide as the output.
57865 // If they are larger than the output, we extract a subvector below.
57866 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57867 In1.getValueSizeInBits() < VT.getSizeInBits())
57868 return SDValue();
57869 }
57870 // Mul is commutative so the input vectors can be in any order.
57871 // Canonicalize to make the compares easier.
57872 if (In0 != N00In)
57873 std::swap(N00In, N01In);
57874 if (In0 != N10In)
57875 std::swap(N10In, N11In);
57876 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57877 return SDValue();
57878 }
57879
57880 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57882 EVT OpVT = Ops[0].getValueType();
57883 assert(OpVT.getScalarType() == MVT::i16 &&
57884 "Unexpected scalar element type");
57885 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57886 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57887 OpVT.getVectorNumElements() / 2);
57888 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57889 };
57890
57891 // If the output is narrower than an input, extract the low part of the input
57892 // vector.
57893 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57894 VT.getVectorNumElements() * 2);
57895 if (OutVT16.bitsLT(In0.getValueType())) {
57896 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57897 DAG.getVectorIdxConstant(0, DL));
57898 }
57899 if (OutVT16.bitsLT(In1.getValueType())) {
57900 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57901 DAG.getVectorIdxConstant(0, DL));
57902 }
57903 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57904 PMADDBuilder);
57905}
57906
57907// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57908 // If the upper element in each operand pair of both VPMADDWD nodes is zero,
57909 // then we can merge the operand elements and use the implicit add of VPMADDWD.
57910// TODO: Add support for VPMADDUBSW (which isn't commutable).
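// For example (illustrative): if X = <x0,0,x1,0,...> and Z = <z0,0,z1,0,...>,
// then vpmaddwd(X,Y) + vpmaddwd(Z,W) can be computed as a single
// vpmaddwd(<x0,z0,x1,z1,...>, <y0,w0,y1,w1,...>), letting VPMADDWD's implicit
// pairwise add perform the outer addition.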
57912 const SDLoc &DL, EVT VT) {
57913 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57914 return SDValue();
57915
57916 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57917 if (VT.getSizeInBits() > 128)
57918 return SDValue();
57919
57920 unsigned NumElts = VT.getVectorNumElements();
57921 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57923 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57924
57925 bool Op0HiZero =
57926 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57927 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57928 bool Op1HiZero =
57929 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57930 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57931
57932 // TODO: Check for zero lower elements once we have actual codegen that
57933 // creates them.
57934 if (!Op0HiZero || !Op1HiZero)
57935 return SDValue();
57936
57937 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57938 SmallVector<int> Mask;
57939 for (int i = 0; i != (int)NumElts; ++i) {
57940 Mask.push_back(2 * i);
57941 Mask.push_back(2 * (i + NumElts));
57942 }
57943
57944 SDValue LHS =
57945 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57946 SDValue RHS =
57947 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57948 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57949}
57950
57951/// CMOV of constants requires materializing constant operands in registers.
57952/// Try to fold those constants into an 'add' instruction to reduce instruction
57953 /// count. We do this with CMOV rather than the generic 'select' because there are
57954/// earlier folds that may be used to turn select-of-constants into logic hacks.
57956 SelectionDAG &DAG,
57957 const X86Subtarget &Subtarget) {
57958 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57959 // better because we eliminate 1-2 instructions. This transform is still
57960 // an improvement without zero operands because we trade 2 move constants and
57961 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57962 // immediate asm operands (fit in 32 bits).
57963 auto isSuitableCmov = [](SDValue V) {
57964 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57965 return false;
57966 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57967 !isa<ConstantSDNode>(V.getOperand(1)))
57968 return false;
57969 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57970 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57971 V.getConstantOperandAPInt(1).isSignedIntN(32));
57972 };
57973
57974 // Match an appropriate CMOV as the first operand of the add.
57975 SDValue Cmov = N->getOperand(0);
57976 SDValue OtherOp = N->getOperand(1);
57977 if (!isSuitableCmov(Cmov))
57978 std::swap(Cmov, OtherOp);
57979 if (!isSuitableCmov(Cmov))
57980 return SDValue();
57981
57982 // Don't remove a load folding opportunity for the add. That would neutralize
57983 // any improvements from removing constant materializations.
57984 if (X86::mayFoldLoad(OtherOp, Subtarget))
57985 return SDValue();
57986
57987 EVT VT = N->getValueType(0);
57988 SDValue FalseOp = Cmov.getOperand(0);
57989 SDValue TrueOp = Cmov.getOperand(1);
57990
57991 // We will push the add through the select, but we can potentially do better
57992 // if we know there is another add in the sequence and this is pointer math.
57993 // In that case, we can absorb an add into the trailing memory op and avoid
57994 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57995 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57996 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57997 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57998 all_of(N->users(), [&](SDNode *Use) {
57999 auto *MemNode = dyn_cast<MemSDNode>(Use);
58000 return MemNode && MemNode->getBasePtr().getNode() == N;
58001 })) {
58002 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58003 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58004 // it is possible that choosing op1 might be better.
58005 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58006 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58007 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58008 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58009 Cmov.getOperand(2), Cmov.getOperand(3));
58010 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58011 }
58012
58013 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58014 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58015 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58016 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58017 Cmov.getOperand(3));
58018}
58019
58020 // Attempt to turn ADD(MUL(x, y), acc) -> VPMADD52L
58021 // when the upper 12 bits of x, y and MUL(x, y) are known to be 0.
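// VPMADD52L adds the low 52 bits of the 104-bit product X*Y to the
// accumulator; when the product is known to fit in 52 bits, that low half is
// the whole product, so the separate mul+add pair can be fused into one IFMA
// operation.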
58023 EVT VT, const X86Subtarget &Subtarget) {
58024 using namespace SDPatternMatch;
58025 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58026 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58027 return SDValue();
58028
58029 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58030 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58031 VT.getSizeInBits() < 512)
58032 return SDValue();
58033
58034 const auto TotalSize = VT.getSizeInBits();
58035 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58036 return SDValue();
58037
58038 SDValue X, Y, Acc;
58039 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58040 return SDValue();
58041
58042 KnownBits KnownX = DAG.computeKnownBits(X);
58043 if (KnownX.countMinLeadingZeros() < 12)
58044 return SDValue();
58045 KnownBits KnownY = DAG.computeKnownBits(Y);
58046 if (KnownY.countMinLeadingZeros() < 12)
58047 return SDValue();
58048 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58049 if (KnownMul.countMinLeadingZeros() < 12)
58050 return SDValue();
58051
58052 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58053 ArrayRef<SDValue> SubOps) {
58054 EVT SubVT = SubOps[0].getValueType();
58055 assert(SubVT.getScalarSizeInBits() == 64 &&
58056 "Unexpected element size, only supports 64bit size");
58057 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58058 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58059 };
58060
58061 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58062 /*CheckBWI*/ false);
58063}
58064
58067 const X86Subtarget &Subtarget) {
58068 using namespace SDPatternMatch;
58069 EVT VT = N->getValueType(0);
58070 SDValue Op0 = N->getOperand(0);
58071 SDValue Op1 = N->getOperand(1);
58072 SDLoc DL(N);
58073
58074 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58075 return Select;
58076
58077 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58078 return MAdd;
58079 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58080 return MAdd;
58081 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58082 return MAdd;
58083
58084 // Try to synthesize horizontal adds from adds of shuffles.
58085 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58086 return V;
58087
58088 // Canonicalize hidden LEA pattern:
58089 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58090 // iff c < 4
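// For example (illustrative): ((x << 2) - y) + z is rewritten as
// ((x << 2) + z) - y, exposing a scale+index+base LEA for the shift and add
// before the remaining subtract.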
58091 if (VT == MVT::i32 || VT == MVT::i64) {
58092 SDValue Y, Z, Shift;
58093 APInt Amt;
58094 if (sd_match(
58096 m_Shl(m_Value(), m_ConstInt(Amt))),
58097 m_Value(Y))),
58098 m_Value(Z))) &&
58099 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58100 return DAG.getNode(ISD::SUB, DL, VT,
58101 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58102 }
58103 }
58104
58105 SDValue X, Y;
58106
58107 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58108 // iff X and Y won't overflow.
58109 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58111 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58112 MVT OpVT = X.getSimpleValueType();
58113 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58114 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58115 getZeroVector(OpVT, Subtarget, DAG, DL));
58116 }
58117
58118 if (VT.isVector()) {
58119 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58121
58122 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58123 // (sub Y, (sext (vXi1 X))).
58124 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58125 // in generic DAG combine without a legal type check, but adding this there
58126 // caused regressions.
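  // (For a vXi1 lane b, zext(b) is 0 or 1 while sext(b) is 0 or -1, so
  // add(zext(X), Y) == sub(Y, sext(X)).)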
58127 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58129 m_Value(Y)))) {
58130 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58131 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58132 }
58133
58134 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58135 // canonicalisation as we don't have good vXi8 shifts.
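  // ((srl Y, 7) on an i8 lane isolates the sign bit (1 iff Y is negative),
  // and sext(setgt 0, Y) is -1 iff Y is negative, so adding the shifted sign
  // bit is the same as subtracting the sign-extended compare.)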
58136 if (VT.getScalarType() == MVT::i8 &&
58138 SDValue Cmp =
58139 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58140 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58141 }
58142 }
58143
58144 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58145 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58146 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58147 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58148 if (sd_match(N, m_Add(m_Value(Accum),
58151 m_Value(Lo1)),
58153 m_Value(Hi1)))))) {
58154 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58155 concatSubVectors(Lo0, Hi0, DAG, DL),
58156 concatSubVectors(Lo1, Hi1, DAG, DL));
58157 }
58158 }
58159
58160 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58161 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58162 X86::isZeroNode(Op0.getOperand(1))) {
58163 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58164 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58165 Op0.getOperand(0), Op0.getOperand(2));
58166 }
58167
58168 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58169 return IFMA52;
58170
58171 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58172}
58173
58174// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58175// condition comes from the subtract node that produced -X. This matches the
58176// cmov expansion for absolute value. By swapping the operands we convert abs
58177// to nabs.
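// Concretely: the CMOV here selects |X| from the X/-X pair; swapping its arms
// selects -|X| instead, and Y - |X| == Y + (-|X|), so the SUB becomes an ADD.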
58178static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58179 SelectionDAG &DAG) {
58180 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58181 return SDValue();
58182
58183 SDValue Cond = N1.getOperand(3);
58184 if (Cond.getOpcode() != X86ISD::SUB)
58185 return SDValue();
58186 assert(Cond.getResNo() == 1 && "Unexpected result number");
58187
58188 SDValue FalseOp = N1.getOperand(0);
58189 SDValue TrueOp = N1.getOperand(1);
58190 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58191
58192 // ABS condition should come from a negate operation.
58193 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58194 isNullConstant(Cond.getOperand(0))) {
58195 // Get the X and -X from the negate.
58196 SDValue NegX = Cond.getValue(0);
58197 SDValue X = Cond.getOperand(1);
58198
58199 // Cmov operands should be X and NegX. Order doesn't matter.
58200 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58201 return SDValue();
58202
58203 // Build a new CMOV with the operands swapped.
58204 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58205 N1.getOperand(2), Cond);
58206 // Convert sub to add.
58207 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58208 }
58209
58210 // Handle ABD special case:
58211 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58212 // ABD condition should come from a pair of matching subtracts.
58213 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58214 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58215 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58216 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58217 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58218 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58219 // Build a new CMOV with the operands swapped.
58220 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58221 Cond);
58222 }
58223
58224 return SDValue();
58225}
58226
58227static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58228 SDValue Op0 = N->getOperand(0);
58229 SDValue Op1 = N->getOperand(1);
58230
58231 // (sub C (zero_extend (setcc)))
58232 // =>
58233 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
58234 // Don't disturb (sub 0 setcc), which is easily done with neg.
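  // (Since zext(setcc) is 0 or 1, C - zext(setcc) == (C - 1) + zext(!setcc),
  // which trades the non-encodable immediate-LHS sub for an add.)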
58235 EVT VT = N->getValueType(0);
58236 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58237 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58238 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58239 Op1.getOperand(0).hasOneUse()) {
58240 SDValue SetCC = Op1.getOperand(0);
58241 X86::CondCode CC = X86::CondCode(SetCC.getConstantOperandVal(0));
58242 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58243 APInt NewImm = Op0C->getAPIntValue() - 1;
58244 SDLoc DL(Op1);
58245 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58246 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58247 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58248 DAG.getConstant(NewImm, DL, VT));
58249 }
58250
58251 return SDValue();
58252}
58253
58254static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58255 if (N->getConstantOperandVal(3) != X86::COND_NE)
58256 return SDValue();
58257
58258 SDValue Sub = N->getOperand(4);
58259 if (Sub.getOpcode() != X86ISD::SUB)
58260 return SDValue();
58261
58262 SDValue Op1 = Sub.getOperand(1);
58263
58264 if (!X86::isZeroNode(Sub.getOperand(0)))
58265 return SDValue();
58266
58267 SDLoc DL(N);
58268 SmallVector<SDValue, 5> Ops(N->op_values());
58269 if (Op1.getOpcode() == X86ISD::SETCC) {
58270 // res, flags2 = sub 0, (setcc cc, flag)
58271 // cload/cstore ..., cond_ne, flag2
58272 // ->
58273 // cload/cstore cc, flag
58274 Ops[3] = Op1.getOperand(0);
58275 Ops[4] = Op1.getOperand(1);
58276 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58277 SDValue Src = Op1;
58278 SDValue Op10 = Op1.getOperand(0);
58279 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58280 // res, flags2 = sub 0, (and (xor X, -1), Y)
58281 // cload/cstore ..., cond_ne, flag2
58282 // ->
58283 // res, flags2 = sub 0, (and X, Y)
58284 // cload/cstore ..., cond_e, flag2
58285 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58286 Op1.getOperand(1));
58287 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58288 }
58289 // res, flags2 = sub 0, (and X, Y)
58290 // cload/cstore ..., cc, flag2
58291 // ->
58292 // res, flags2 = cmp (and X, Y), 0
58293 // cload/cstore ..., cc, flag2
58294 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58295 } else {
58296 return SDValue();
58297 }
58298
58299 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58300 cast<MemSDNode>(N)->getMemoryVT(),
58301 cast<MemSDNode>(N)->getMemOperand());
58302}
58303
58304static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58305                          TargetLowering::DAGCombinerInfo &DCI,
58306 const X86Subtarget &Subtarget) {
58307 EVT VT = N->getValueType(0);
58308 SDValue Op0 = N->getOperand(0);
58309 SDValue Op1 = N->getOperand(1);
58310 SDLoc DL(N);
58311
58312 auto IsNonOpaqueConstant = [&](SDValue Op) {
58313 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58314 /*AllowOpaques*/ false);
58315 };
58316
58317 // X86 can't encode an immediate LHS of a sub. See if we can push the
58318 // negation into a preceding instruction. If the RHS of the sub is an XOR with
58319 // one use and a constant, invert the immediate, saving one register.
58320 // However, ignore cases where C1 is 0, as those will become a NEG.
58321 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
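  // (Derivation: -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1, hence
  //  C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1).)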
58322 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58323 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58324 Op1->hasOneUse()) {
58325 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58326 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58327 SDValue NewAdd =
58328 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58329 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58330 }
58331
58332 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58333 return V;
58334
58335 // Try to synthesize horizontal subs from subs of shuffles.
58336 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58337 return V;
58338
58339 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58340 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58341 X86::isZeroNode(Op1.getOperand(1))) {
58342 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58343 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58344 Op1.getOperand(0), Op1.getOperand(2));
58345 }
58346
58347 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58348 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
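  // (SBB(Y,Z,W) computes Y - Z - carry, so X - SBB(Y,Z,W) ==
  //  X + Z + carry - Y == SUB(ADC(X,Z,W), Y).)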
58349 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58350 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58351 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58352 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58353 Op1.getOperand(1), Op1.getOperand(2));
58354 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58355 }
58356
58357 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58358 return V;
58359
58360 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58361 return V;
58362
58363 return combineSubSetcc(N, DAG);
58364}
58365
58366static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58367 const X86Subtarget &Subtarget) {
58368 unsigned Opcode = N->getOpcode();
58369 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58370 "Unknown PCMP opcode");
58371
58372 SDValue LHS = N->getOperand(0);
58373 SDValue RHS = N->getOperand(1);
58374 MVT VT = N->getSimpleValueType(0);
58375 unsigned EltBits = VT.getScalarSizeInBits();
58376 unsigned NumElts = VT.getVectorNumElements();
58377 SDLoc DL(N);
58378
58379 if (LHS == RHS)
58380 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58381 : DAG.getConstant(0, DL, VT);
58382
58383 // Constant Folding.
58384 // PCMPEQ(X,UNDEF) -> UNDEF
58385 // PCMPGT(X,UNDEF) -> 0
58386 // PCMPGT(UNDEF,X) -> 0
58387 APInt LHSUndefs, RHSUndefs;
58388 SmallVector<APInt> LHSBits, RHSBits;
58389 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58390 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58391 APInt Ones = APInt::getAllOnes(EltBits);
58392 APInt Zero = APInt::getZero(EltBits);
58393 SmallVector<APInt> Results(NumElts);
58394 for (unsigned I = 0; I != NumElts; ++I) {
58395 if (Opcode == X86ISD::PCMPEQ) {
58396 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58397 } else {
58398 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58399 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58400 }
58401 }
58402 if (Opcode == X86ISD::PCMPEQ)
58403 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58404 return getConstVector(Results, VT, DAG, DL);
58405 }
58406
58407 return SDValue();
58408}
58409
58410// Helper to determine if we can convert an integer comparison to a float
58411// comparison by casting the operands.
58412static std::optional<unsigned>
58413CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58414 unsigned NumSignificantBitsRHS) {
58415 MVT SVT = VT.getScalarType();
58416 assert(SVT == MVT::f32 && "Only tested for float so far");
58417 const fltSemantics &Sem = SVT.getFltSemantics();
58418 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58419 "Only PCMPEQ/PCMPGT currently supported");
58420
58421 // TODO: Handle bitcastable integers.
58422
58423 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58424 // a fp value.
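  // (e.g. f32 has 24 bits of significand precision, so any integer with at
  //  most 24 significant bits converts exactly and integer EQ/GT agrees with
  //  the ordered FP compare on the converted values.)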
58425 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58426 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58427 return ISD::SINT_TO_FP;
58428
58429 return std::nullopt;
58430}
58431
58432/// Helper that combines an array of subvector ops as if they were the operands
58433/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58434/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58435static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58436                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58437 const X86Subtarget &Subtarget,
58438 unsigned Depth) {
58439 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58440 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58441
58442 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58443 return DAG.getUNDEF(VT);
58444
58445 if (llvm::all_of(Ops, [](SDValue Op) {
58446 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58447 }))
58448 return getZeroVector(VT, Subtarget, DAG, DL);
58449
58449
58450 if (Depth >= SelectionDAG::MaxRecursionDepth)
58451 return SDValue(); // Limit search depth.
58452
58453 SDValue Op0 = Ops[0];
58454 bool IsSplat = llvm::all_equal(Ops);
58455 unsigned NumOps = Ops.size();
58456 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58457 LLVMContext &Ctx = *DAG.getContext();
58458
58459 // Repeated subvectors.
58460 if (IsSplat &&
58461 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58462 // If this broadcast is inserted into both halves, use a larger broadcast.
58463 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58464 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58465
58466 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58467 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58468 (Subtarget.hasAVX2() ||
58470 VT.getScalarType(), Subtarget)))
58471 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58472 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58473 Op0.getOperand(0),
58474 DAG.getVectorIdxConstant(0, DL)));
58475
58476 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58477 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58478 (Subtarget.hasAVX2() ||
58479 (EltSizeInBits >= 32 &&
58480 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58481 Op0.getOperand(0).getValueType() == VT.getScalarType())
58482 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58483
58484 // concat_vectors(extract_subvector(splat(x)),
58485 // extract_subvector(splat(x))) -> splat(x)
58486 // concat_vectors(extract_subvector(subv_broadcast(x)),
58487 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58488 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58489 Op0.getOperand(0).getValueType() == VT) {
58490 SDValue SrcVec = Op0.getOperand(0);
58491 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58492 return SrcVec;
58493 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58494 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58495 return SrcVec;
58496 }
58497
58498 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58499 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58500 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58501 return DAG.getNode(Op0.getOpcode(), DL, VT,
58503 Op0.getOperand(0), Op0.getOperand(0)),
58504 Op0.getOperand(1));
58505 }
58506
58507 // TODO: This should go in combineX86ShufflesRecursively eventually.
58508 if (NumOps == 2) {
58509 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58510 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58511 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58513 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58514 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58515 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58516 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58517 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58518 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58519 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58520 // Only concat subvector high halves (which vperm2x128 is best at), or
58521 // cases where it should fold into a subvector broadcast.
58522 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58523 SrcVT1.is256BitVector()) {
58524 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58525 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58526 "Bad subvector index");
58527 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58528 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58529 unsigned Index = 0;
58530 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58531 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
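        // VPERM2X128 immediate: bits [1:0] pick the 128-bit half of the first
        // source for the low result half (0 = lo, 1 = hi), bits [5:4] pick the
        // half of the second source for the high result half (2 = lo, 3 = hi).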
58532 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58533 DAG.getBitcast(VT, Src0.getOperand(0)),
58534 DAG.getBitcast(VT, Src1.getOperand(0)),
58535 DAG.getTargetConstant(Index, DL, MVT::i8));
58536 }
58537 }
58538 // Widen extract_subvector
58539 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58540 // --> extract_subvector(x,lo)
58541 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58542 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58543 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58544 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58545 return DAG.getBitcast(VT,
58547 Src0.getConstantOperandVal(1),
58548 DAG, DL, VT.getSizeInBits()));
58549 }
58550 }
58551 }
58552
58553 // Repeated opcode.
58554 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58555 // but it currently struggles with different vector widths.
58556 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58557 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58558 })) {
58559 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58561 for (SDValue SubOp : SubOps)
58562 Subs.push_back(SubOp.getOperand(I));
58563 // Attempt to peek through bitcasts and concat the original subvectors.
58564 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58565 if (SubVT.isSimple() && SubVT.isVector()) {
58566 MVT ConcatVT =
58568 SubVT.getVectorElementCount() * Subs.size());
58569 for (SDValue &Sub : Subs)
58570 Sub = DAG.getBitcast(SubVT, Sub);
58571 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58572 Subtarget, Depth + 1))
58573 return DAG.getBitcast(VT, ConcatSrc);
58574 return DAG.getBitcast(
58575 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58576 }
58577 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58578 };
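    // IsConcatFree: concatenating operand 'Op' of every subop is considered
    // free if the operands are all constants, all the same load, or
    // consecutive extract_subvectors of one full-width source vector.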
58579 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58580 bool AllConstants = true;
58581 bool AllSubs = true;
58582 unsigned VecSize = VT.getSizeInBits();
58583 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58584 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58585 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58586 }))
58587 return true;
58588 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58589 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58590 unsigned SubSize = BC.getValueSizeInBits();
58591 unsigned EltSize = BC.getScalarValueSizeInBits();
58592 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58594 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58595 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58596 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58597 }
58598 return AllConstants || AllSubs;
58599 };
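    // CombineSubOperand: returns a concatenation of operand 'I' only when it
    // is known to fold (all-constant operands or a successful recursive
    // combine); otherwise returns an empty SDValue so callers can decide
    // whether concatenating is still profitable.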
58600 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58601 bool AllConstants = true;
58603 for (SDValue SubOp : SubOps) {
58604 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58605 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58607 Subs.push_back(SubOp.getOperand(I));
58608 }
58609 if (AllConstants)
58610 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58611 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58612 };
58613
58614 unsigned Opcode = Op0.getOpcode();
58615 switch (Opcode) {
58616 case ISD::BITCAST: {
58617 // TODO: Support AVX1/AVX2 bitcasts.
58619 for (SDValue SubOp : Ops)
58620 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58621 EVT InnerVT = SubOps[0].getValueType();
58622 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58623 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58624 (Subtarget.hasBWI() ||
58625 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58626 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58627 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58628 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58629 return Op.getValueType() == InnerVT;
58630 })) {
58631 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58632 MVT ConcatVT = MVT::getVectorVT(
58633 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58634 if (SDValue ConcatSrc = combineConcatVectorOps(
58635 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58636 return DAG.getBitcast(VT, ConcatSrc);
58637 }
58638 break;
58639 }
58640 case ISD::VECTOR_SHUFFLE: {
58641 // TODO: Generalize NumOps support.
58642 if (!IsSplat && NumOps == 2 &&
58643 ((VT.is256BitVector() &&
58644 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58645 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58646 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58647 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58648 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58649 if (Concat0 || Concat1 ||
58650 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58651 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58652 Subtarget.hasVBMI())) {
58653 int NumSubElts = Op0.getValueType().getVectorNumElements();
58654 SmallVector<int> NewMask;
58655 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58656 M = M >= NumSubElts ? M + NumSubElts : M;
58657 NewMask.push_back(M);
58658 }
58659 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58660 if (0 <= M)
58661 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58662 NewMask.push_back(M);
58663 }
58664 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58665 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58666 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58667 }
58668 }
58669 break;
58670 }
58671 case X86ISD::VBROADCAST: {
58672 // TODO: 512-bit VBROADCAST concatenation.
58673 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58674 return Op.getOperand(0).getValueType().is128BitVector();
58675 })) {
58676 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58677 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58678 ConcatSubOperand(VT, Ops, 0),
58679 ConcatSubOperand(VT, Ops, 0));
58680 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58681 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58682 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58684 DL, VT, ConcatSubOperand(VT, Ops, 0),
58685 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58686 }
58687 break;
58688 }
58689 case X86ISD::MOVDDUP:
58690 case X86ISD::MOVSHDUP:
58691 case X86ISD::MOVSLDUP: {
58692 if (!IsSplat && (VT.is256BitVector() ||
58693 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58694 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58695 break;
58696 }
58697 case X86ISD::SHUFP: {
58698 if (!IsSplat &&
58699 (VT == MVT::v8f32 ||
58700 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58701 llvm::all_of(Ops, [Op0](SDValue Op) {
58702 return Op.getOperand(2) == Op0.getOperand(2);
58703 })) {
58704 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58705 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58706 if (Concat0 || Concat1)
58707 return DAG.getNode(Opcode, DL, VT,
58708 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58709 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58710 Op0.getOperand(2));
58711 }
58712 break;
58713 }
58714 case X86ISD::UNPCKH:
58715 case X86ISD::UNPCKL: {
58716 // TODO: UNPCK should use CombineSubOperand
58717 // Don't concatenate build_vector patterns.
58718 if (!IsSplat &&
58719 ((VT.is256BitVector() &&
58720 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58721 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58722 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58723 none_of(Ops, [](SDValue Op) {
58724 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58726 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58728 })) {
58729 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58730 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58731 if (Concat0 || Concat1 ||
58732 (Subtarget.hasInt256() && EltSizeInBits == 64))
58733 return DAG.getNode(Opcode, DL, VT,
58734 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58735 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58736 }
58737 break;
58738 }
58739 case X86ISD::PSHUFHW:
58740 case X86ISD::PSHUFLW:
58741 case X86ISD::PSHUFD:
58742 if (!IsSplat &&
58743 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58744 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58745 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58746 llvm::all_of(Ops, [Op0](SDValue Op) {
58747 return Op.getOperand(1) == Op0.getOperand(1);
58748 })) {
58749 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58750 Op0.getOperand(1));
58751 }
58752 [[fallthrough]];
58753 case X86ISD::VPERMILPI:
58754 if (!IsSplat && EltSizeInBits == 32 &&
58755 (VT.is256BitVector() ||
58756 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58757 all_of(Ops, [&Op0](SDValue Op) {
58758 return Op0.getOperand(1) == Op.getOperand(1);
58759 })) {
58760 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58761 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58762 Res =
58763 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58764 return DAG.getBitcast(VT, Res);
58765 }
58766 break;
58767 case X86ISD::VPERMILPV:
58768 if (!IsSplat && (VT.is256BitVector() ||
58769 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58770 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58771 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58772 if (Concat0 || Concat1)
58773 return DAG.getNode(Opcode, DL, VT,
58774 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58775 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58776 }
58777 break;
58778 case X86ISD::PSHUFB:
58779 case X86ISD::PSADBW:
58780 case X86ISD::VPMADDUBSW:
58781 case X86ISD::VPMADDWD:
58782 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58783 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58784 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58785 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58786 NumOps * SrcVT.getVectorNumElements());
58787 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58788 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58789 if (Concat0 || Concat1)
58790 return DAG.getNode(
58791 Opcode, DL, VT,
58792 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58793 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58794 }
58795 break;
58796 case X86ISD::VPERMV:
58797 // TODO: Handle 256-bit and NumOps == 4 cases.
58798 if (!IsSplat && NumOps == 2 &&
58799 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58800 MVT OpVT = Op0.getSimpleValueType();
58801 int NumSrcElts = OpVT.getVectorNumElements();
58802 SmallVector<int, 64> ConcatMask;
58803 for (unsigned i = 0; i != NumOps; ++i) {
58804 SmallVector<int, 64> SubMask;
58806 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58807 break;
58808 for (int M : SubMask) {
58809 if (0 <= M)
58810 M += i * NumSrcElts;
58811 ConcatMask.push_back(M);
58812 }
58813 }
58814 if (ConcatMask.size() == (NumOps * NumSrcElts))
58815 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58816 ConcatSubOperand(VT, Ops, 1),
58817 DAG.getUNDEF(VT), Subtarget, DAG);
58818 }
58819 break;
58820 case X86ISD::VPERMV3:
58821 // TODO: Handle 256-bit and NumOps == 4 cases.
58822 if (!IsSplat && NumOps == 2 &&
58823 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58824 MVT OpVT = Op0.getSimpleValueType();
58825 int NumSrcElts = OpVT.getVectorNumElements();
58826 SmallVector<int, 64> ConcatMask;
58827 for (unsigned i = 0; i != NumOps; ++i) {
58828 SmallVector<int, 64> SubMask;
58830 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58831 break;
58832 for (int M : SubMask) {
58833 if (0 <= M) {
58834 int Src = M < NumSrcElts ? 0 : 2;
58835 M += M < NumSrcElts ? 0 : NumSrcElts;
58836
58837 // Reference the lowest sub if the upper sub is the same.
58838 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58839 M += i * NumSrcElts;
58840 }
58841 ConcatMask.push_back(M);
58842 }
58843 }
58844 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58845 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58846 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58847 if (Concat0 || Concat1)
58848 return lowerShuffleWithPERMV(
58849 DL, VT, ConcatMask,
58850 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58851 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58852 DAG);
58853 }
58854 }
58855 break;
58856 case X86ISD::VPERM2X128: {
58857 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58858 assert(NumOps == 2 && "Bad concat_vectors operands");
58859 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58860 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58861 // TODO: Handle zero'd subvectors.
58862 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58863 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58864 (int)((Imm1 >> 4) & 0x3)};
58865 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58866 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58867 Ops[0].getOperand(1), DAG, DL);
58868 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58869 Ops[1].getOperand(1), DAG, DL);
58870 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58871 DAG.getBitcast(ShuffleVT, LHS),
58872 DAG.getBitcast(ShuffleVT, RHS),
58873 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58874 return DAG.getBitcast(VT, Res);
58875 }
58876 }
58877 break;
58878 }
58879 case X86ISD::SHUF128: {
58880 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58881 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58882 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58883 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58884 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58885 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58886 Ops[0].getOperand(1), DAG, DL);
58887 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58888 Ops[1].getOperand(1), DAG, DL);
58889 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58890 DAG.getTargetConstant(Imm, DL, MVT::i8));
58891 }
58892 break;
58893 }
58894 case ISD::TRUNCATE:
58895 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58896 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58897 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58898 SrcVT == Ops[1].getOperand(0).getValueType() &&
58899 Subtarget.useAVX512Regs() &&
58900 Subtarget.getPreferVectorWidth() >= 512 &&
58901 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58902 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58903 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58904 ConcatSubOperand(NewSrcVT, Ops, 0));
58905 }
58906 }
58907 break;
58908 case ISD::ANY_EXTEND:
58909 case ISD::SIGN_EXTEND:
58910 case ISD::ZERO_EXTEND:
58911 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58912 if (!IsSplat && NumOps == 2 &&
58913 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58914 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58915 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58916 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58917 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58918 SrcVT == Ops[1].getOperand(0).getValueType()) {
58919 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58920 return DAG.getNode(Opcode, DL, VT,
58921 ConcatSubOperand(NewSrcVT, Ops, 0));
58922 }
58923 }
58924 break;
58928 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58929 if (!IsSplat && NumOps == 2 &&
58930 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58931 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58932 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58934 Op0.getOperand(0).getValueType() ==
58935 Ops[0].getOperand(0).getValueType()) {
58936 EVT SrcVT = Op0.getOperand(0).getValueType();
58937 unsigned NumElts = VT.getVectorNumElements();
58938 MVT UnpackSVT =
58939 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58940 MVT UnpackVT =
58941 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58942 SDValue Unpack =
58943 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58944 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58945 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58946 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58947 DAG.getBitcast(SrcVT, Unpack), DAG);
58948 }
58949 break;
58950 }
58951 case X86ISD::VSHLI:
58952 case X86ISD::VSRLI:
58953 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
58954 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58955 llvm::all_of(Ops, [](SDValue Op) {
58956 return Op.getConstantOperandAPInt(1) == 32;
58957 })) {
58958 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58959 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58960 Res = DAG.getBitcast(MVT::v8i32, Res);
58961 if (Opcode == X86ISD::VSHLI) {
58962 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58963 {8, 0, 8, 2, 8, 4, 8, 6});
58964 } else {
58965 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58966 {1, 8, 3, 8, 5, 8, 7, 8});
58967 }
58968 return DAG.getBitcast(VT, Res);
58969 }
58970 }
58971 [[fallthrough]];
58972 case X86ISD::VSRAI:
58973 case X86ISD::VSHL:
58974 case X86ISD::VSRL:
58975 case X86ISD::VSRA:
58976 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58977 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58978 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58979 llvm::all_of(Ops, [Op0](SDValue Op) {
58980 return Op0.getOperand(1) == Op.getOperand(1);
58981 })) {
58982 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58983 Op0.getOperand(1));
58984 }
58985 break;
58986 case X86ISD::VPERMI:
58987 case X86ISD::VROTLI:
58988 case X86ISD::VROTRI:
58989 if (!IsSplat &&
58990 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58991 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58992 llvm::all_of(Ops, [Op0](SDValue Op) {
58993 return Op0.getOperand(1) == Op.getOperand(1);
58994 })) {
58995 assert(!(Opcode == X86ISD::VPERMI &&
58996 Op0.getValueType().is128BitVector()) &&
58997 "Illegal 128-bit X86ISD::VPERMI nodes");
58998 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58999 Op0.getOperand(1));
59000 }
59001 break;
59002 case ISD::AND:
59003 case ISD::OR:
59004 case ISD::XOR:
59005 case X86ISD::ANDNP:
59006 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59007 if (!IsSplat && (VT.is256BitVector() ||
59008 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59009 // Don't concatenate root AVX1 NOT patterns.
59010 // TODO: Allow NOT folding if Concat0 succeeds.
59011 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59012 llvm::all_of(Ops, [](SDValue X) {
59013 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59014 }))
59015 break;
59016 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59017 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59018 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59019 return DAG.getNode(Opcode, DL, VT,
59020 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59021 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59022 }
59023 break;
59024 case X86ISD::PCMPEQ:
59025 case X86ISD::PCMPGT:
59026 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59027 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59028 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59029 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59030 if (Concat0 || Concat1)
59031 return DAG.getNode(Opcode, DL, VT,
59032 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59033 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59034 break;
59035 }
59036
59037 if (!IsSplat && VT == MVT::v8i32) {
59038 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59039 // TODO: Handle v4f64 as well?
59040 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59041 for (unsigned I = 0; I != NumOps; ++I) {
59042 MaxSigBitsLHS =
59043 std::max(MaxSigBitsLHS,
59044 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59045 MaxSigBitsRHS =
59046 std::max(MaxSigBitsRHS,
59047 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59048 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59049 break;
59050 }
59051
59052 ISD::CondCode ICC =
59053 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59054 ISD::CondCode FCC =
59055 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59056
59057 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59058 MVT FpVT = VT.changeVectorElementType(FpSVT);
59059
59060 if (std::optional<unsigned> CastOpc =
59061 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59062 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59063 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59064 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59065 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59066 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59067 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59068
59069 bool IsAlwaysSignaling;
59070 unsigned FSETCC =
59071 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59072 return DAG.getBitcast(
59073 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59074 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59075 }
59076 }
59077 break;
59078 case ISD::CTPOP:
59079 case ISD::CTTZ:
59080 case ISD::CTLZ:
59083 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59084 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59085 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59086 }
59087 break;
59089 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59090 if (!IsSplat &&
59091 (VT.is256BitVector() ||
59092 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59093 llvm::all_of(Ops, [Op0](SDValue Op) {
59094 return Op0.getOperand(2) == Op.getOperand(2);
59095 })) {
59096 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59097 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59098 }
59099 break;
59100 case ISD::ADD:
59101 case ISD::SUB:
59102 case ISD::MUL:
59103 // TODO: Add more integer binops?
59104 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59105 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59106 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59107 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59108 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59109 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59110 return Op.getOperand(0) == Op.getOperand(1);
59111 }))
59112 return DAG.getNode(Opcode, DL, VT,
59113 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59114 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59115 }
59116 break;
59117 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
59118 // have short latency, we don't replace them here unless doing so won't
59119 // introduce an extra VINSERT.
59120 case ISD::FADD:
59121 case ISD::FSUB:
59122 case ISD::FMUL:
59123 if (!IsSplat && (VT.is256BitVector() ||
59124 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59125 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59126 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59127 if (Concat0 || Concat1)
59128 return DAG.getNode(Opcode, DL, VT,
59129 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59130 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59131 }
59132 break;
59133 // Always prefer to concatenate high latency FDIV instructions.
59134 case ISD::FDIV:
59135 if (!IsSplat && (VT.is256BitVector() ||
59136 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59137 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59138 ConcatSubOperand(VT, Ops, 1));
59139 }
59140 break;
59141 case X86ISD::HADD:
59142 case X86ISD::HSUB:
59143 case X86ISD::FHADD:
59144 case X86ISD::FHSUB:
59145 if (!IsSplat && VT.is256BitVector() &&
59146 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59147 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59148 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59149 if (Concat0 || Concat1)
59150 return DAG.getNode(Opcode, DL, VT,
59151 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59152 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59153 }
59154 break;
59155 case X86ISD::PACKSS:
59156 case X86ISD::PACKUS:
59157 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59158 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59159 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59160 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59161 NumOps * SrcVT.getVectorNumElements());
59162 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59163 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59164 if (Concat0 || Concat1)
59165 return DAG.getNode(
59166 Opcode, DL, VT,
59167 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59168 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59169 }
59170 break;
59171 case X86ISD::VSHLD:
59172 case X86ISD::VSHRD:
59173 case X86ISD::PALIGNR:
59174 if (!IsSplat &&
59175 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59176 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59177 llvm::all_of(Ops, [Op0](SDValue Op) {
59178 return Op0.getOperand(2) == Op.getOperand(2);
59179 })) {
59180 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59181 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59182 if (Concat0 || Concat1)
59183 return DAG.getNode(Opcode, DL, VT,
59184 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59185 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59186 Op0.getOperand(2));
59187 }
59188 break;
59189 case X86ISD::BLENDI:
59190 if (VT.is256BitVector() && NumOps == 2 &&
59191 (EltSizeInBits >= 32 ||
59192 (Subtarget.hasInt256() &&
59193 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59194 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59195 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59196 if (Concat0 || Concat1) {
59197 unsigned NumElts = VT.getVectorNumElements();
59198 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59199 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59200 Mask = Mask.zextOrTrunc(8);
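        // Each 128-bit half keeps its own blend selection: Ops[0]'s mask fills
        // the low bits and Ops[1]'s mask is inserted at NumElts/2. Truncating
        // back to 8 bits is safe because either there are at most 8 elements
        // or (for 16 x i16) both halves were required to use the same immediate.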
59201 return DAG.getNode(Opcode, DL, VT,
59202 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59203 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59204 DAG.getTargetConstant(Mask, DL, MVT::i8));
59205 }
59206 }
59207 // TODO: BWI targets should only use CombineSubOperand.
59208 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59209 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59210 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59211 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59212 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59213 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59214 unsigned NumElts = VT.getVectorNumElements();
59215 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59216 for (unsigned I = 1; I != NumOps; ++I)
59217 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59218 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59219 Mask = Mask.zextOrTrunc(NumMaskBits);
59220 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59221 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59222 SDValue Sel =
59223 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59224 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59225 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59226 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59227 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59228 }
59229 }
59230 break;
59231 case ISD::VSELECT:
59232 // TODO: VSELECT should use CombineSubOperand.
59233 if (!IsSplat && Subtarget.hasAVX512() &&
59234 (VT.is256BitVector() ||
59235 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59236 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59237 EVT SelVT = Ops[0].getOperand(0).getValueType();
59238 if (SelVT.getVectorElementType() == MVT::i1) {
59239 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59240 NumOps * SelVT.getVectorNumElements());
59241 if (TLI.isTypeLegal(SelVT))
59242 return DAG.getNode(
59243 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59244 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59245 }
59246 }
59247 [[fallthrough]];
59248 case X86ISD::BLENDV:
59249 // TODO: BLENDV should use CombineSubOperand.
59250 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59251 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59252 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59253 EVT SelVT = Ops[0].getOperand(0).getValueType();
59254 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59255 if (TLI.isTypeLegal(SelVT))
59256 return DAG.getNode(
59257 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59258 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59259 }
59260 break;
59261 }
59262 }
59263
59264 // Fold subvector loads into one.
59265 // If needed, look through bitcasts to get to the load.
59266 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59267 unsigned Fast;
59268 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59269 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59270 *FirstLd->getMemOperand(), &Fast) &&
59271 Fast) {
59272 if (SDValue Ld =
59273 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59274 return Ld;
59275 }
59276 }
59277
59278 // Attempt to fold target constant loads.
59279 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59280 SmallVector<APInt> EltBits;
59281 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59282 for (unsigned I = 0; I != NumOps; ++I) {
59283 APInt OpUndefElts;
59284 SmallVector<APInt> OpEltBits;
59285 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59286 OpEltBits, /*AllowWholeUndefs*/ true,
59287 /*AllowPartialUndefs*/ false))
59288 break;
59289 EltBits.append(OpEltBits);
59290 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59291 }
59292 if (EltBits.size() == VT.getVectorNumElements()) {
59293 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59294 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59295 SDValue CV = DAG.getConstantPool(C, PVT);
59298 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59299 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59301 return Ld;
59302 }
59303 }
59304
59305 // If this simple subvector or scalar/subvector broadcast_load is inserted
59306 // into both halves, use a larger broadcast_load. Update other uses to use
59307 // an extracted subvector.
59308 if (IsSplat &&
59309 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59310 if (ISD::isNormalLoad(Op0.getNode()) ||
59313 auto *Mem = cast<MemSDNode>(Op0);
59314 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59317 if (SDValue BcastLd =
59318 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59319 SDValue BcastSrc =
59320 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59321 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59322 return BcastLd;
59323 }
59324 }
59325 }
59326
59327 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59328 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59329 Subtarget.useAVX512Regs()) {
59330 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59331 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59332 Res = DAG.getBitcast(ShuffleVT, Res);
59333 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59334 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59335 return DAG.getBitcast(VT, Res);
59336 }
59337
59338 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59339 if (!IsSplat &&
59340 ((NumOps == 2 && VT == MVT::v4f64) ||
59341 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59342 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59343 // Collect the individual per-lane v2f64/v4f64 shuffles.
59344 MVT OpVT = Ops[0].getSimpleValueType();
59345 unsigned NumOpElts = OpVT.getVectorNumElements();
59348 if (all_of(seq<int>(NumOps), [&](int I) {
59349 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59350 Depth + 1) &&
59351 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59352 none_of(SrcMasks[I], isUndefOrZero) &&
59353 SrcMasks[I].size() == NumOpElts &&
59354 all_of(SrcOps[I], [&OpVT](SDValue V) {
59355 return V.getValueType() == OpVT;
59356 });
59357 })) {
59358 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59359 bool Unary = true;
59360 unsigned SHUFPDMask = 0;
59361 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59362 for (unsigned I = 0; I != NumOps; ++I) {
59363 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59364 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59365 Unary &= LHS[I] == RHS[I];
59366 for (unsigned J = 0; J != NumOpElts; ++J)
59367 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59368 }
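      // Each per-lane v2f64 shuffle contributes one SHUFPD immediate bit per
      // element (selecting the low or high f64 within its 128-bit lane);
      // element 0 of each lane is sourced from the LHS operands and element 1
      // from the RHS operands.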
59369 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59370 // PERMILPD mask and we can always profitably concatenate them.
59371 SDValue Concat0 =
59372 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59373 SDValue Concat1 =
59374 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59375 if (Unary || Concat0 || Concat1) {
59376 Concat0 =
59377 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59378 Concat1 =
59379 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59380 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59381 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59382 }
59383 }
59384 }
59385
59386 return SDValue();
59387}
59388
59389static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59390                                     TargetLowering::DAGCombinerInfo &DCI,
59391 const X86Subtarget &Subtarget) {
59392 EVT VT = N->getValueType(0);
59393 EVT SrcVT = N->getOperand(0).getValueType();
59394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59395 SmallVector<SDValue, 4> Ops(N->ops());
59396
59397 if (VT.getVectorElementType() == MVT::i1) {
59398 // Attempt to constant fold.
59399 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59400 APInt Constant = APInt::getZero(VT.getSizeInBits());
59401 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59402 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59403 if (!C) break;
59404 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59405 if (I == (E - 1)) {
59406 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59407 if (TLI.isTypeLegal(IntVT))
59408 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59409 }
59410 }
59411
59412 // Don't do anything else for i1 vectors.
59413 return SDValue();
59414 }
59415
59416 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59417 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59418 Subtarget))
59419 return R;
59420 }
59421
59422 return SDValue();
59423}
59424
59425static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59426                                       TargetLowering::DAGCombinerInfo &DCI,
59427 const X86Subtarget &Subtarget) {
59428 if (DCI.isBeforeLegalizeOps())
59429 return SDValue();
59430
59431 MVT OpVT = N->getSimpleValueType(0);
59432
59433 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59434
59435 SDLoc dl(N);
59436 SDValue Vec = N->getOperand(0);
59437 SDValue SubVec = N->getOperand(1);
59438
59439 uint64_t IdxVal = N->getConstantOperandVal(2);
59440 MVT SubVecVT = SubVec.getSimpleValueType();
59441 int VecNumElts = OpVT.getVectorNumElements();
59442 int SubVecNumElts = SubVecVT.getVectorNumElements();
59443
59444 if (Vec.isUndef() && SubVec.isUndef())
59445 return DAG.getUNDEF(OpVT);
59446
59447 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59448 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59449 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59450 return getZeroVector(OpVT, Subtarget, DAG, dl);
59451
59453 // If we're inserting into a zero vector and then into a larger zero vector,
59454 // just insert into the larger zero vector directly.
59455 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59457 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59458 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59459 getZeroVector(OpVT, Subtarget, DAG, dl),
59460 SubVec.getOperand(1),
59461 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59462 }
59463
59464 // If we're inserting into a zero vector and our input was extracted from an
59465 // insert into a zero vector of the same type, and the extraction was at
59466 // least as large as the original insertion, just insert the original
59467 // subvector into a zero vector.
59468 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59469 isNullConstant(SubVec.getOperand(1)) &&
59471 SDValue Ins = SubVec.getOperand(0);
59472 if (isNullConstant(Ins.getOperand(2)) &&
59473 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59474 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59475 SubVecVT.getFixedSizeInBits())
59476 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59477 getZeroVector(OpVT, Subtarget, DAG, dl),
59478 Ins.getOperand(1), N->getOperand(2));
59479 }
59480 }
59481
59482 // Stop here if this is an i1 vector.
59483 if (IsI1Vector)
59484 return SDValue();
59485
59486 // Eliminate an intermediate vector widening:
59487 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59488 // insert_subvector X, Y, Idx
59489 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59490 // there?
59491 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59492 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59493 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59494 SubVec.getOperand(1), N->getOperand(2));
59495
59496 // If this is an insert of an extract, combine to a shuffle. Don't do this
59497 // if the insert or extract can be represented with a subregister operation.
59498 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59499 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59500 (IdxVal != 0 ||
59501 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59502 SDValue ExtSrc = SubVec.getOperand(0);
59503 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59504 // Create a shuffle mask matching the extraction and insertion.
59505 SmallVector<int, 64> Mask(VecNumElts);
59506 std::iota(Mask.begin(), Mask.end(), 0);
59507 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59508 ExtIdxVal + VecNumElts);
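    // Start from the identity mask over Vec, then remap the inserted window to
    // elements of ExtSrc (offset by VecNumElts to address the second shuffle
    // operand), starting at the original extraction index.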
59509 if (ExtIdxVal != 0)
59510 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59511 // See if we can use a blend instead of extract/insert pair.
59512 SmallVector<int, 64> BlendMask(VecNumElts);
59513 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59514 std::iota(BlendMask.begin() + IdxVal,
59515 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59516 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59517 VecNumElts == (2 * SubVecNumElts)) {
59518 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59519 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59520 SDValue Blend = DAG.getNode(
59521 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59522 DAG.getBitcast(MVT::v8f32, ExtSrc),
59523 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59524 return DAG.getBitcast(OpVT, Blend);
59525 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59526 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59527 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59528 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59529 SDValue Shuffle =
59530 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59531 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59532 return DAG.getBitcast(OpVT, Shuffle);
59533 }
59534 }
59535 }
59536
59537 // Match concat_vector style patterns.
59538 SmallVector<SDValue, 2> SubVectorOps;
59539 if (collectConcatOps(N, SubVectorOps, DAG)) {
59540 if (SDValue Fold =
59541 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59542 return Fold;
59543
59544 // If we're inserting all zeros into the upper half, change this to
59545 // a concat with zero. We will match this to a move
59546 // with implicit upper bit zeroing during isel.
59547 // We do this here because we don't want combineConcatVectorOps to
59548 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59549 if (SubVectorOps.size() == 2 &&
59550 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59551 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59552 getZeroVector(OpVT, Subtarget, DAG, dl),
59553 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59554
59555 // Attempt to recursively combine to a shuffle.
59556 if (all_of(SubVectorOps, [](SDValue SubOp) {
59558 })) {
59559 SDValue Op(N, 0);
59560 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59561 return Res;
59562 }
59563 }
59564
59565 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59566 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59567 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59568
59569 // If this is a broadcast load inserted into an upper undef, use a larger
59570 // broadcast load.
59571 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59572 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59573 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59574 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59575 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59576 }
59577
59578 // If we're splatting the lower half subvector of a full vector load into the
59579 // upper half, attempt to create a subvector broadcast.
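// e.g. insert_subvector (load <8 x float>, %p), (load <4 x float>, %p), 4 is a
// splat of the low 128 bits and can become a subvector broadcast load.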
59580 if ((int)IdxVal == (VecNumElts / 2) &&
59581 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59582 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59583 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59584 if (VecLd && SubLd &&
59585 DAG.areNonVolatileConsecutiveLoads(
59586 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59587 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59588 SubVecVT, SubLd, 0, DAG);
59589 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59590 BcastLd, DAG.getVectorIdxConstant(0, dl));
59591 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59592 return BcastLd;
59593 }
59594 }
59595
59596 // Attempt to constant fold (if we're not widening).
59597 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59598 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59599 APInt VecUndefElts, SubUndefElts;
59600 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59601 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59602 VecEltBits) &&
59603 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59604 SubEltBits)) {
59605 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59606 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59607 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59608 }
59609 }
59610
59611 // Attempt to recursively combine to a shuffle.
59614 SDValue Op(N, 0);
59615 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59616 return Res;
59617 }
59618
59619 // Match insertion of subvector load that perfectly aliases a base load.
59620 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59621 ISD::isNormalLoad(SubVec.getNode()) &&
59622 DAG.areNonVolatileConsecutiveLoads(
59623 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59624 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59625 return Vec;
59626
59627 return SDValue();
59628}
59629
59630/// If we are extracting a subvector of a vector select and the select condition
59631/// is composed of concatenated vectors, try to narrow the select width. This
59632/// is a common pattern for AVX1 integer code because 256-bit selects may be
59633/// legal, but there is almost no integer math/logic available for 256-bit.
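/// For example, (v4i32 (extract_subvector (v8i32 (vselect M, X, Y)), 4)) can
/// become (v4i32 (vselect (extract M, 4), (extract X, 4), (extract Y, 4))).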
59634/// This function should only be called with legal types (otherwise, the calls
59635/// to get simple value types will assert).
59636 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59637 SelectionDAG &DAG) {
59638 SDValue Sel = Ext->getOperand(0);
59639 if (Sel.getOpcode() != ISD::VSELECT ||
59640 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59641 return SDValue();
59642
59643 // Note: We assume simple value types because this should only be called with
59644 // legal operations/types.
59645 // TODO: This can be extended to handle extraction to 256-bits.
59646 MVT VT = Ext->getSimpleValueType(0);
59647 if (!VT.is128BitVector())
59648 return SDValue();
59649
59650 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59651 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59652 return SDValue();
59653
59654 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59655 MVT SelVT = Sel.getSimpleValueType();
59656 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59657 "Unexpected vector type with legal operations");
59658
59659 unsigned SelElts = SelVT.getVectorNumElements();
59660 unsigned CastedElts = WideVT.getVectorNumElements();
59661 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59662 if (SelElts % CastedElts == 0) {
59663 // The select has the same or more (narrower) elements than the extract
59664 // operand. The extraction index gets scaled by that factor.
59665 ExtIdx *= (SelElts / CastedElts);
59666 } else if (CastedElts % SelElts == 0) {
59667 // The select has fewer (wider) elements than the extract operand. Make sure
59668 // that the extraction index can be divided evenly.
59669 unsigned IndexDivisor = CastedElts / SelElts;
59670 if (ExtIdx % IndexDivisor != 0)
59671 return SDValue();
59672 ExtIdx /= IndexDivisor;
59673 } else {
59674 llvm_unreachable("Element count of simple vector types are not divisible?");
59675 }
59676
59677 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59678 unsigned NarrowElts = SelElts / NarrowingFactor;
59679 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59680 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59681 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59682 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59683 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59684 return DAG.getBitcast(VT, NarrowSel);
59685}
59686
59687 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59688 TargetLowering::DAGCombinerInfo &DCI,
59689 const X86Subtarget &Subtarget) {
59690 if (!N->getValueType(0).isSimple())
59691 return SDValue();
59692
59693 MVT VT = N->getSimpleValueType(0);
59694 SDValue InVec = N->getOperand(0);
59695 unsigned IdxVal = N->getConstantOperandVal(1);
59696 EVT InVecVT = InVec.getValueType();
59697 unsigned SizeInBits = VT.getSizeInBits();
59698 unsigned InSizeInBits = InVecVT.getSizeInBits();
59699 unsigned NumSubElts = VT.getVectorNumElements();
59700 unsigned NumInElts = InVecVT.getVectorNumElements();
59701 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59702 SDLoc DL(N);
59703
59704 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59705 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59706 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59707 // We let generic combining take over from there to simplify the
59708 // insert/extract and 'not'.
59709 // This pattern emerges during AVX1 legalization. We handle it before lowering
59710 // to avoid complications like splitting constant vector loads.
59711 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59712 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59713 auto isConcatenatedNot = [](SDValue V) {
59714 V = peekThroughBitcasts(V);
59715 if (!isBitwiseNot(V))
59716 return false;
59717 SDValue NotOp = V->getOperand(0);
59718 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59719 };
59720 if (isConcatenatedNot(InVec.getOperand(0)) ||
59721 isConcatenatedNot(InVec.getOperand(1))) {
59722 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59723 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59724 splitVectorIntBinary(InVec, DAG, DL),
59725 N->getOperand(1));
59726 }
59727 }
59728
59729 if (DCI.isBeforeLegalizeOps())
59730 return SDValue();
59731
59732 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59733 return V;
59734
59735 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59736 return getZeroVector(VT, Subtarget, DAG, DL);
59737
59738 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59739 if (VT.getScalarType() == MVT::i1)
59740 return DAG.getConstant(1, DL, VT);
59741 return getOnesVector(VT, DAG, DL);
59742 }
59743
59744 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59745 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59746
59747 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
59748 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59749 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59750 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59751 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59752 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59753 }
59754
59755 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59756 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59757 // iff SUB is entirely contained in the extraction.
59758 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59759 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59760 SDValue Src = InVec.getOperand(0);
59761 SDValue Sub = InVec.getOperand(1);
59762 EVT SubVT = Sub.getValueType();
59763 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59764 if (IdxVal <= InsIdx &&
59765 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59766 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59767 DAG.getVectorIdxConstant(IdxVal, DL));
59768 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59769 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59770 }
59771 }
59772
59773 // If we're extracting an upper subvector, see if we'd get the same elements
59774 // by extracting the lowest subvector instead, which should allow
59775 // SimplifyDemandedVectorElts to do more simplifications.
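// e.g. if InVec splats its low subvector across the whole vector, extracting
// the upper half yields the same elements as extracting the lower half.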
59776 if (IdxVal != 0) {
59777 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59778 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59779 });
59780 if (AllEquiv)
59781 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59782 }
59783
59784 // Check if we're extracting a whole broadcasted subvector.
59785 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59786 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59787 EVT MemVT = MemIntr->getMemoryVT();
59788 if (MemVT == VT) {
59789 // If this is the only use, we can replace with a regular load (this may
59790 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59791 // memory chain).
59792 if (InVec.hasOneUse()) {
59793 SDValue Ld =
59794 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59795 MemIntr->getMemOperand());
59796 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59797 return Ld;
59798 }
59799 }
59800 }
59801
59802 // Attempt to extract from the source of a shuffle vector.
59803 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59804 SmallVector<int, 32> ShuffleMask;
59805 SmallVector<int, 32> ScaledMask;
59806 SmallVector<SDValue, 2> ShuffleInputs;
59807 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59808 // Decode the shuffle mask and scale it so it shuffles whole subvectors.
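// e.g. a 256-bit shuffle whose mask scales to <1, 0> swaps 128-bit halves, so
// extracting either half maps directly onto a subvector of a shuffle input.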
59809 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59810 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59811 unsigned SubVecIdx = IdxVal / NumSubElts;
59812 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59813 return DAG.getUNDEF(VT);
59814 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59815 return getZeroVector(VT, Subtarget, DAG, DL);
59816 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59817 if (Src.getValueSizeInBits() == InSizeInBits) {
59818 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59819 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59820 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59821 DL, SizeInBits);
59822 }
59823 }
59824 }
59825
59826 auto IsExtractFree = [](SDValue V) {
59827 if (V.hasOneUse()) {
59828 V = peekThroughOneUseBitcasts(V);
59829 if (V.getOpcode() == ISD::LOAD)
59830 return true;
59831 }
59832 V = peekThroughBitcasts(V);
59833 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59834 return true;
59835 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59836 return true;
59837 return V.isUndef();
59838 };
59839
59840 // If we're extracting the lowest subvector and we're the only user,
59841 // we may be able to perform this with a smaller vector width.
59842 unsigned InOpcode = InVec.getOpcode();
59843 if (InVec.hasOneUse()) {
59844 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59845 // v2f64 CVTDQ2PD(v4i32).
59846 if (InOpcode == ISD::SINT_TO_FP &&
59847 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59848 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59849 }
59850 // v2f64 CVTUDQ2PD(v4i32).
59851 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59852 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59853 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59854 }
59855 // v2f64 CVTPS2PD(v4f32).
59856 if (InOpcode == ISD::FP_EXTEND &&
59857 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59858 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59859 }
59860 }
59861 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59862 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59863 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59864 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59865 Subtarget.hasVLX())) &&
59866 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59867 SDValue Src = InVec.getOperand(0);
59868 if (Src.getValueType().getScalarSizeInBits() == 32)
59869 return DAG.getNode(InOpcode, DL, VT,
59870 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59871 }
59872 if (IdxVal == 0 &&
59873 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59874 (SizeInBits == 128 || SizeInBits == 256) &&
59875 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59876 SDValue Ext = InVec.getOperand(0);
59877 if (Ext.getValueSizeInBits() > SizeInBits)
59878 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59879 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59880 return DAG.getNode(ExtOp, DL, VT, Ext);
59881 }
59882 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59883 InVec.getOperand(0).getValueType().is256BitVector() &&
59884 InVec.getOperand(1).getValueType().is256BitVector() &&
59885 InVec.getOperand(2).getValueType().is256BitVector()) {
59886 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59887 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59888 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59889 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59890 }
59891 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59892 (SizeInBits == 128 || SizeInBits == 256)) {
59893 SDValue InVecSrc = InVec.getOperand(0);
59894 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59895 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59896 return DAG.getNode(InOpcode, DL, VT, Ext);
59897 }
59898
59899 if (SizeInBits == 128 || SizeInBits == 256) {
59900 switch (InOpcode) {
59901 case X86ISD::MOVDDUP:
59902 return DAG.getNode(
59903 InOpcode, DL, VT,
59904 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59905 case X86ISD::PSHUFD:
59906 case X86ISD::VPERMILPI:
59907 if (InVec.getOperand(0).hasOneUse()) {
59908 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59909 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59910 return DAG.getNode(InOpcode, DL, VT,
59911 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59912 DL, SizeInBits),
59913 DAG.getTargetConstant(M, DL, MVT::i8));
59914 }
59915 break;
59916 case X86ISD::PCMPEQ:
59917 case X86ISD::PCMPGT:
59918 case X86ISD::UNPCKH:
59919 case X86ISD::UNPCKL:
59920 if (IsExtractFree(InVec.getOperand(0)) ||
59921 IsExtractFree(InVec.getOperand(1)))
59922 return DAG.getNode(InOpcode, DL, VT,
59923 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59924 DL, SizeInBits),
59925 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59926 DL, SizeInBits));
59927 break;
59928 case X86ISD::CMPP:
59929 if (IsExtractFree(InVec.getOperand(0)) ||
59930 IsExtractFree(InVec.getOperand(1)))
59931 return DAG.getNode(InOpcode, DL, VT,
59932 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59933 DL, SizeInBits),
59934 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59935 DL, SizeInBits),
59936 InVec.getOperand(2));
59937 break;
59938 case X86ISD::BLENDI:
59939 if (IsExtractFree(InVec.getOperand(0)) ||
59940 IsExtractFree(InVec.getOperand(1))) {
59941 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59942 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59943 return DAG.getNode(InOpcode, DL, VT,
59944 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59945 DL, SizeInBits),
59946 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59947 DL, SizeInBits),
59948 DAG.getTargetConstant(M, DL, MVT::i8));
59949 }
59950 break;
59951 case X86ISD::VPERMV:
59952 if (IdxVal != 0) {
59953 SDValue Mask = InVec.getOperand(0);
59954 SDValue Src = InVec.getOperand(1);
59955 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59956 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59957 DL, InSizeInBits);
59958 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59959 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59960 }
59961 break;
59962 case X86ISD::VPERMV3:
59963 if (IdxVal != 0) {
59964 SDValue Src0 = InVec.getOperand(0);
59965 SDValue Mask = InVec.getOperand(1);
59966 SDValue Src1 = InVec.getOperand(2);
59967 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59968 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59969 DL, InSizeInBits);
59970 SDValue Shuffle =
59971 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59972 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59973 }
59974 break;
59975 }
59976 }
59977 }
59978
59979 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
59980 // as this is very likely to fold into a shuffle/truncation.
59981 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59982 InVecVT.getScalarSizeInBits() == 64 &&
59983 InVec.getConstantOperandAPInt(1) == 32) {
59984 SDValue Ext =
59985 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59986 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59987 }
59988
59989 return SDValue();
59990}
59991
59992 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59993 const X86Subtarget &Subtarget) {
59994 using namespace SDPatternMatch;
59995 EVT VT = N->getValueType(0);
59996 SDValue Src = N->getOperand(0);
59997 SDLoc DL(N);
59998
59999 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60000 // This occurs frequently in our masked scalar intrinsic code and our
60001 // floating point select lowering with AVX512.
60002 // TODO: SimplifyDemandedBits instead?
60003 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60004 isOneConstant(Src.getOperand(1)))
60005 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60006
60007 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60008 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60009 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60010 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60011 isNullConstant(Src.getOperand(1)))
60012 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60013 Src.getOperand(1));
60014
60015 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
60016 // TODO: Move to DAGCombine/SimplifyDemandedBits?
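// e.g. (v2i64 scalar_to_vector (i64 zext X)) only needs the low 32 bits, so it
// can be built as a v4i32 scalar_to_vector of X with the upper elements zeroed
// by VZEXT_MOVL.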
60017 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60018 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60019 if (Op.getValueType() != MVT::i64)
60020 return SDValue();
60021 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60022 if (Op.getOpcode() == Opc &&
60023 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60024 return Op.getOperand(0);
60025 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60026 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60027 if (Ld->getExtensionType() == Ext &&
60028 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60029 return Op;
60030 if (IsZeroExt) {
60031 KnownBits Known = DAG.computeKnownBits(Op);
60032 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60033 return Op;
60034 }
60035 return SDValue();
60036 };
60037
60038 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60039 return DAG.getBitcast(
60040 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60041 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60042
60043 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60044 return DAG.getBitcast(
60045 VT,
60046 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60047 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60048 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60049 }
60050
60051 if (Src.getOpcode() == ISD::BITCAST) {
60052 SDValue SrcOp = Src.getOperand(0);
60053 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60054 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60055 return DAG.getBitcast(
60056 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60057 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60058 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60059 return DAG.getBitcast(
60060 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60061 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60062 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60063 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60064 }
60065
60066 if (VT == MVT::v4i32) {
60067 SDValue HalfSrc;
60068 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60069 // to remove XMM->GPR->XMM moves.
60070 if (sd_match(Src, m_AnyExt(m_BitCast(
60071 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60072 return DAG.getBitcast(
60073 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60074 }
60075
60076 // See if we're broadcasting the scalar value, in which case just reuse that.
60077 // Ensure the broadcast is using this exact SDValue result, not just the same SDNode.
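// e.g. if X also feeds (v8f32 (X86ISD::VBROADCAST X)), then
// (v4f32 (scalar_to_vector X)) is just the low 128 bits of that broadcast.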
60078 if (VT.getScalarType() == Src.getValueType())
60079 for (SDNode *User : Src->users())
60080 if (User->getOpcode() == X86ISD::VBROADCAST &&
60081 Src == User->getOperand(0)) {
60082 unsigned SizeInBits = VT.getFixedSizeInBits();
60083 unsigned BroadcastSizeInBits =
60084 User->getValueSizeInBits(0).getFixedValue();
60085 if (BroadcastSizeInBits == SizeInBits)
60086 return SDValue(User, 0);
60087 if (BroadcastSizeInBits > SizeInBits)
60088 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60089 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60090 // coverage.
60091 }
60092
60093 // Check for cases where we've ended up with a scalarized shift, typically
60094 // during type legalization.
60095 switch (Src.getOpcode()) {
60096 case ISD::SHL:
60097 case ISD::SRL:
60098 case ISD::SRA:
60099 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60100 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60101 Src.hasOneUse()) {
60102 SDValue SrcVec =
60103 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60104 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60105 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60106 Amt->getZExtValue(), DAG);
60107 }
60108 }
60109 break;
60110 case ISD::FSHL:
60111 case ISD::FSHR:
60112 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60113 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60114 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60115 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60116 Src.hasOneUse()) {
60117 uint64_t AmtVal =
60118 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60119 SDValue SrcVec0 =
60120 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60121 SDValue SrcVec1 =
60122 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60123 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60124 DAG.getConstant(AmtVal, DL, VT));
60125 }
60126 }
60127 break;
60128 }
60129
60130 return SDValue();
60131}
60132
60133// Simplify PMULDQ and PMULUDQ operations.
60134 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60135 TargetLowering::DAGCombinerInfo &DCI,
60136 const X86Subtarget &Subtarget) {
60137 SDValue LHS = N->getOperand(0);
60138 SDValue RHS = N->getOperand(1);
60139
60140 // Canonicalize constant to RHS.
60141 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60142 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60143 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60144
60145 // Multiply by zero.
60146 // Don't return RHS as it may contain UNDEFs.
60147 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60148 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60149
60150 // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
60151 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60152 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60153 return SDValue(N, 0);
60154
60155 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60156 // convert it to any_extend_invec, due to the LegalOperations check, do the
60157 // conversion directly to a vector shuffle manually. This exposes combine
60158 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60159 // combineX86ShufflesRecursively on SSE4.1 targets.
60160 // FIXME: This is basically a hack around several other issues related to
60161 // ANY_EXTEND_VECTOR_INREG.
60162 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60163 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60164 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60165 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60166 SDLoc dl(N);
60167 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60168 LHS.getOperand(0), { 0, -1, 1, -1 });
60169 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60170 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60171 }
60172 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60173 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60174 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60175 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60176 SDLoc dl(N);
60177 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60178 RHS.getOperand(0), { 0, -1, 1, -1 });
60179 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60180 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60181 }
60182
60183 return SDValue();
60184}
60185
60186// Simplify VPMADDUBSW/VPMADDWD operations.
60187 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60188 TargetLowering::DAGCombinerInfo &DCI) {
60189 MVT VT = N->getSimpleValueType(0);
60190 SDValue LHS = N->getOperand(0);
60191 SDValue RHS = N->getOperand(1);
60192 unsigned Opc = N->getOpcode();
60193 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60194 assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) &&
60195 "Unexpected PMADD opcode");
60196
60197 // Multiply by zero.
60198 // Don't return LHS/RHS as it may contain UNDEFs.
60199 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60200 ISD::isBuildVectorAllZeros(RHS.getNode()))
60201 return DAG.getConstant(0, SDLoc(N), VT);
60202
60203 // Constant folding.
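// VPMADDWD:   res[i] = sext(l[2*i]) * sext(r[2*i]) + sext(l[2*i+1]) * sext(r[2*i+1])
// VPMADDUBSW: res[i] = sadd_sat(zext(l[2*i]) * sext(r[2*i]),
//                               zext(l[2*i+1]) * sext(r[2*i+1]))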
60204 APInt LHSUndefs, RHSUndefs;
60205 SmallVector<APInt> LHSBits, RHSBits;
60206 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60207 unsigned DstEltBits = VT.getScalarSizeInBits();
60208 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60209 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60210 SmallVector<APInt> Result;
60211 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60212 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60213 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60214 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60215 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60216 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60217 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60218 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60219 Result.push_back(Res);
60220 }
60221 return getConstVector(Result, VT, DAG, SDLoc(N));
60222 }
60223
60224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60225 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60226 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60227 return SDValue(N, 0);
60228
60229 return SDValue();
60230}
60231
60232// Simplify VPMADD52L/VPMADD52H operations.
60233 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60234 TargetLowering::DAGCombinerInfo &DCI) {
60235 MVT VT = N->getSimpleValueType(0);
60236
60237 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60238 SDValue Op0 = N->getOperand(0);
60239 SDValue Op1 = N->getOperand(1);
60240 SDValue Op2 = N->getOperand(2);
60241 SDLoc DL(N);
60242
60243 APInt C0, C1;
60244 bool HasC0 = X86::isConstantSplat(Op0, C0),
60245 HasC1 = X86::isConstantSplat(Op1, C1);
60246
60247 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60248 if (HasC0 && !HasC1)
60249 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60250
60251 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
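// With 52-bit lanes, X == lo(X) exactly when the top 12 bits of each 64-bit
// element are known zero, hence the countMinLeadingZeros() >= 12 check below.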
60252 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60253 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60254 if (KnownOp0.countMinLeadingZeros() >= 12)
60255 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60256 }
60257
60258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60259 unsigned NumEltBits = VT.getScalarSizeInBits();
60260 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60261 DCI))
60262 return SDValue(N, 0);
60263
60264 return SDValue();
60265}
60266
60267 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60268 TargetLowering::DAGCombinerInfo &DCI,
60269 const X86Subtarget &Subtarget) {
60270 EVT VT = N->getValueType(0);
60271 SDValue In = N->getOperand(0);
60272 unsigned Opcode = N->getOpcode();
60273 unsigned InOpcode = In.getOpcode();
60274 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60275 SDLoc DL(N);
60276
60277 // Try to merge vector loads and extend_inreg to an extload.
60278 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60279 In.hasOneUse()) {
60280 auto *Ld = cast<LoadSDNode>(In);
60281 if (Ld->isSimple()) {
60282 MVT SVT = In.getSimpleValueType().getVectorElementType();
60283 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60284 ? ISD::SEXTLOAD
60285 : ISD::ZEXTLOAD;
60286 EVT MemVT = VT.changeVectorElementType(SVT);
60287 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60288 SDValue Load = DAG.getExtLoad(
60289 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60290 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60291 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60292 return Load;
60293 }
60294 }
60295 }
60296
60297 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60298 if (Opcode == InOpcode)
60299 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60300
60301 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60302 // -> EXTEND_VECTOR_INREG(X).
60303 // TODO: Handle non-zero subvector indices.
60304 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60305 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60306 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60307 In.getValueSizeInBits())
60308 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60309
60310 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60311 // TODO: Move to DAGCombine?
60312 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60313 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60314 In.getValueSizeInBits() == VT.getSizeInBits()) {
60315 unsigned NumElts = VT.getVectorNumElements();
60316 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60317 EVT EltVT = In.getOperand(0).getValueType();
60318 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60319 for (unsigned I = 0; I != NumElts; ++I)
60320 Elts[I * Scale] = In.getOperand(I);
60321 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60322 }
60323
60324 // Attempt to combine as a shuffle on SSE41+ targets.
60325 if (Subtarget.hasSSE41()) {
60326 SDValue Op(N, 0);
60327 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60328 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60329 return Res;
60330 }
60331
60332 return SDValue();
60333}
60334
60335 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60336 TargetLowering::DAGCombinerInfo &DCI) {
60337 EVT VT = N->getValueType(0);
60338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60339 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60340 return DAG.getConstant(0, SDLoc(N), VT);
60341
60342 // Fold kshiftr(extract_subvector(X,C1),C2)
60343 // --> extract_subvector(kshiftr(X,C1+C2),0)
60344 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60345 if (N->getOpcode() == X86ISD::KSHIFTR) {
60346 SDLoc DL(N);
60347 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60348 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60349 SDValue Src = N->getOperand(0).getOperand(0);
60350 uint64_t Amt = N->getConstantOperandVal(1) +
60351 N->getOperand(0).getConstantOperandVal(1);
60352 EVT SrcVT = Src.getValueType();
60353 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60354 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60355 DAG.getTargetConstant(Amt, DL, MVT::i8));
60356 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60357 DAG.getVectorIdxConstant(0, DL));
60358 }
60359 }
60360 }
60361
60362 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60363 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60364 return SDValue(N, 0);
60365
60366 return SDValue();
60367}
60368
60369// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60370 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
60371 // produce extra instructions between the conversions by going to scalar and back.
60372 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60373 const X86Subtarget &Subtarget) {
60374 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60375 return SDValue();
60376
60377 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60378 return SDValue();
60379
60380 if (N->getValueType(0) != MVT::f32 ||
60381 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60382 return SDValue();
60383
60384 SDLoc dl(N);
60385 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60386 N->getOperand(0).getOperand(0));
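// Rounding immediate 4 (imm8 bit 2 set) tells CVTPS2PH to use the current
// MXCSR rounding mode.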
60387 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60388 DAG.getTargetConstant(4, dl, MVT::i32));
60389 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60390 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60391 DAG.getVectorIdxConstant(0, dl));
60392}
60393
60394 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60395 TargetLowering::DAGCombinerInfo &DCI,
60396 const X86Subtarget &Subtarget) {
60397 EVT VT = N->getValueType(0);
60398 bool IsStrict = N->isStrictFPOpcode();
60399 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60400 EVT SrcVT = Src.getValueType();
60401
60402 SDLoc dl(N);
60403 if (SrcVT.getScalarType() == MVT::bf16) {
60404 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60405 !IsStrict && Src.getOperand(0).getValueType() == VT)
60406 return Src.getOperand(0);
60407
60408 if (!SrcVT.isVector())
60409 return SDValue();
60410
60411 assert(!IsStrict && "Strict FP doesn't support BF16");
60412 if (VT.getVectorElementType() == MVT::f64) {
60413 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60414 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60415 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60416 }
60417 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
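// bf16 is the upper 16 bits of an f32 bit pattern, so the extension is just a
// zero-extend and a left shift by 16 in the integer domain.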
60418 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60419 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60420 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60421 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60422 return DAG.getBitcast(VT, Src);
60423 }
60424
60425 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60426 return SDValue();
60427
60428 if (Subtarget.hasFP16())
60429 return SDValue();
60430
60431 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60432 return SDValue();
60433
60434 if (VT.getVectorElementType() != MVT::f32 &&
60435 VT.getVectorElementType() != MVT::f64)
60436 return SDValue();
60437
60438 unsigned NumElts = VT.getVectorNumElements();
60439 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60440 return SDValue();
60441
60442 // Convert the input to vXi16.
60443 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60444 Src = DAG.getBitcast(IntVT, Src);
60445
60446 // Widen to at least 8 input elements.
60447 if (NumElts < 8) {
60448 unsigned NumConcats = 8 / NumElts;
60449 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60450 : DAG.getConstant(0, dl, IntVT);
60451 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60452 Ops[0] = Src;
60453 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60454 }
60455
60456 // Destination is vXf32 with at least 4 elements.
60457 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60458 std::max(4U, NumElts));
60459 SDValue Cvt, Chain;
60460 if (IsStrict) {
60461 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60462 {N->getOperand(0), Src});
60463 Chain = Cvt.getValue(1);
60464 } else {
60465 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60466 }
60467
60468 if (NumElts < 4) {
60469 assert(NumElts == 2 && "Unexpected size");
60470 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60471 DAG.getVectorIdxConstant(0, dl));
60472 }
60473
60474 if (IsStrict) {
60475 // Extend to the original VT if necessary.
60476 if (Cvt.getValueType() != VT) {
60477 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60478 {Chain, Cvt});
60479 Chain = Cvt.getValue(1);
60480 }
60481 return DAG.getMergeValues({Cvt, Chain}, dl);
60482 }
60483
60484 // Extend to the original VT if necessary.
60485 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60486}
60487
60488// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60489 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60490 TargetLowering::DAGCombinerInfo &DCI) {
60491 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60492 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60493 "Unknown broadcast load type");
60494
60495 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60496 SDValue Ptr = MemIntrin->getBasePtr();
60497 SDValue Chain = MemIntrin->getChain();
60498 EVT VT = N->getSimpleValueType(0);
60499 EVT MemVT = MemIntrin->getMemoryVT();
60500
60501 // Look at other users of our base pointer and try to find a wider broadcast.
60502 // The input chain and the size of the memory VT must match.
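// e.g. a 128-bit broadcast load can reuse a wider (say 512-bit) broadcast load
// of the same element from the same pointer by extracting its low subvector.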
60503 for (SDNode *User : Ptr->users())
60504 if (User != N && User->getOpcode() == N->getOpcode() &&
60505 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60506 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60507 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60508 MemVT.getSizeInBits() &&
60509 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60510 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60511 MemIntrin->isSimple() && "Illegal broadcast load type");
60513 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60514 VT.getSizeInBits());
60515 Extract = DAG.getBitcast(VT, Extract);
60516 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60517 return Extract;
60518 }
60519
60520 return SDValue();
60521}
60522
60523 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60524 const X86Subtarget &Subtarget) {
60525 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60526 return SDValue();
60527
60528 bool IsStrict = N->isStrictFPOpcode();
60529 EVT VT = N->getValueType(0);
60530 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60531 EVT SrcVT = Src.getValueType();
60532
60533 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60534 SrcVT.getVectorElementType() != MVT::f32)
60535 return SDValue();
60536
60537 SDLoc dl(N);
60538
60539 SDValue Cvt, Chain;
60540 unsigned NumElts = VT.getVectorNumElements();
60541 if (Subtarget.hasFP16()) {
60542 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60543 // v4f32 (xint_to_fp v4i64))))
60544 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60545 // v8f16 (CVTXI2P v4i64)))
60546 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60547 Src.getNumOperands() == 2) {
60548 SDValue Cvt0, Cvt1;
60549 SDValue Op0 = Src.getOperand(0);
60550 SDValue Op1 = Src.getOperand(1);
60551 bool IsOp0Strict = Op0->isStrictFPOpcode();
60552 if (Op0.getOpcode() != Op1.getOpcode() ||
60553 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60554 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60555 return SDValue();
60556 }
60557 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60558 if (IsStrict) {
60559 assert(IsOp0Strict && "Op0 must be strict node");
60560 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60561 ? X86ISD::STRICT_CVTSI2P
60562 : X86ISD::STRICT_CVTUI2P;
60563 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60564 {Op0.getOperand(0), Op0.getOperand(1)});
60565 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60566 {Op1.getOperand(0), Op1.getOperand(1)});
60567 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60568 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60569 }
60570 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60571 : X86ISD::CVTUI2P;
60572 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60573 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60574 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60575 }
60576 return SDValue();
60577 }
60578
60579 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60580 return SDValue();
60581
60582 // Widen to at least 4 input elements.
60583 if (NumElts < 4)
60584 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60585 DAG.getConstantFP(0.0, dl, SrcVT));
60586
60587 // Destination is v8i16 with at least 8 elements.
60588 EVT CvtVT =
60589 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60590 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60591 if (IsStrict) {
60592 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60593 {N->getOperand(0), Src, Rnd});
60594 Chain = Cvt.getValue(1);
60595 } else {
60596 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60597 }
60598
60599 // Extract down to real number of elements.
60600 if (NumElts < 8) {
60601 EVT IntVT = VT.changeVectorElementTypeToInteger();
60602 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60603 DAG.getVectorIdxConstant(0, dl));
60604 }
60605
60606 Cvt = DAG.getBitcast(VT, Cvt);
60607
60608 if (IsStrict)
60609 return DAG.getMergeValues({Cvt, Chain}, dl);
60610
60611 return Cvt;
60612}
60613
60614 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60615 SDValue Src = N->getOperand(0);
60616
60617 // Turn MOVDQ2Q+simple_load into an mmx load.
60618 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60619 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60620
60621 if (LN->isSimple()) {
60622 SDValue NewLd =
60623 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60624 LN->getPointerInfo(), LN->getBaseAlign(),
60625 LN->getMemOperand()->getFlags());
60626 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60627 return NewLd;
60628 }
60629 }
60630
60631 return SDValue();
60632}
60633
60634 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60635 TargetLowering::DAGCombinerInfo &DCI) {
60636 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60637 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60638 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60639 return SDValue(N, 0);
60640
60641 return SDValue();
60642}
60643
60644// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60645// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60646// use x86mmx instead.
60647 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60648 SDLoc dl(N);
60649
60650 bool MadeChange = false, CastReturnVal = false;
60651 SmallVector<SDValue, 8> Args;
60652 for (const SDValue &Arg : N->op_values()) {
60653 if (Arg.getValueType() == MVT::v1i64) {
60654 MadeChange = true;
60655 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60656 } else
60657 Args.push_back(Arg);
60658 }
60659 SDVTList VTs = N->getVTList();
60660 SDVTList NewVTs = VTs;
60661 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60662 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60663 NewVTArr[0] = MVT::x86mmx;
60664 NewVTs = DAG.getVTList(NewVTArr);
60665 MadeChange = true;
60666 CastReturnVal = true;
60667 }
60668
60669 if (MadeChange) {
60670 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60671 if (CastReturnVal) {
60672 SmallVector<SDValue, 2> Returns;
60673 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60674 Returns.push_back(Result.getValue(i));
60675 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60676 return DAG.getMergeValues(Returns, dl);
60677 }
60678 return Result;
60679 }
60680 return SDValue();
60681}
60682 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60683 TargetLowering::DAGCombinerInfo &DCI) {
60684 if (!DCI.isBeforeLegalize())
60685 return SDValue();
60686
60687 unsigned IntNo = N->getConstantOperandVal(0);
60688 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60689
60690 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60691 return FixupMMXIntrinsicTypes(N, DAG);
60692
60693 return SDValue();
60694}
60695
60696 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60697 TargetLowering::DAGCombinerInfo &DCI) {
60698 if (!DCI.isBeforeLegalize())
60699 return SDValue();
60700
60701 unsigned IntNo = N->getConstantOperandVal(1);
60702 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60703
60704 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60705 return FixupMMXIntrinsicTypes(N, DAG);
60706
60707 return SDValue();
60708}
60709
60710 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60711 TargetLowering::DAGCombinerInfo &DCI) {
60712 if (!DCI.isBeforeLegalize())
60713 return SDValue();
60714
60715 unsigned IntNo = N->getConstantOperandVal(1);
60716 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60717
60718 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60719 return FixupMMXIntrinsicTypes(N, DAG);
60720
60721 return SDValue();
60722}
60723
60724 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60725 DAGCombinerInfo &DCI) const {
60726 SelectionDAG &DAG = DCI.DAG;
60727 switch (N->getOpcode()) {
60728 // clang-format off
60729 default: break;
60730 case ISD::SCALAR_TO_VECTOR:
60731 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60732 case ISD::EXTRACT_VECTOR_ELT:
60733 case X86ISD::PEXTRW:
60734 case X86ISD::PEXTRB:
60735 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60736 case ISD::CONCAT_VECTORS:
60737 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60738 case ISD::INSERT_SUBVECTOR:
60739 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60740 case ISD::EXTRACT_SUBVECTOR:
60741 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60742 case ISD::VSELECT:
60743 case ISD::SELECT:
60744 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60745 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60746 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60747 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60748 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60749 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60750 case X86ISD::ADD:
60751 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60752 case X86ISD::CLOAD:
60753 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60754 case X86ISD::SBB: return combineSBB(N, DAG);
60755 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60756 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60757 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60758 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60759 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60760 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60761 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60762 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60763 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60764 case ISD::AVGCEILS:
60765 case ISD::AVGCEILU:
60766 case ISD::AVGFLOORS:
60767 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60768 case X86ISD::BEXTR:
60769 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60770 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60771 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60772 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60773 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60774 case X86ISD::VEXTRACT_STORE:
60775 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60776 case ISD::SINT_TO_FP:
60777 case ISD::STRICT_SINT_TO_FP:
60778 return combineSIntToFP(N, DAG, DCI, Subtarget);
60779 case ISD::UINT_TO_FP:
60780 case ISD::STRICT_UINT_TO_FP:
60781 return combineUIntToFP(N, DAG, Subtarget);
60782 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60783 case ISD::LRINT:
60784 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60785 case ISD::FADD:
60786 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60787 case X86ISD::VFCMULC:
60788 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60789 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60790 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60791 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60792 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60793 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60794 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60795 case X86ISD::FXOR:
60796 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60797 case X86ISD::FMIN:
60798 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60799 case ISD::FMINNUM:
60800 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60801 case X86ISD::CVTSI2P:
60802 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60803 case X86ISD::CVTP2SI:
60804 case X86ISD::CVTP2UI:
60805 case X86ISD::STRICT_CVTTP2SI:
60806 case X86ISD::CVTTP2SI:
60807 case X86ISD::STRICT_CVTTP2UI:
60808 case X86ISD::CVTTP2UI:
60809 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60810 case X86ISD::STRICT_CVTPH2PS:
60811 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60812 case X86ISD::BT: return combineBT(N, DAG, DCI);
60813 case ISD::ANY_EXTEND:
60814 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60815 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60816 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60817 case ISD::ANY_EXTEND_VECTOR_INREG:
60818 case ISD::SIGN_EXTEND_VECTOR_INREG:
60819 case ISD::ZERO_EXTEND_VECTOR_INREG:
60820 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60821 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60822 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60823 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60824 case X86ISD::PACKSS:
60825 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60826 case X86ISD::HADD:
60827 case X86ISD::HSUB:
60828 case X86ISD::FHADD:
60829 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60830 case X86ISD::VSHL:
60831 case X86ISD::VSRA:
60832 case X86ISD::VSRL:
60833 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60834 case X86ISD::VSHLI:
60835 case X86ISD::VSRAI:
60836 case X86ISD::VSRLI:
60837 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60838 case ISD::INSERT_VECTOR_ELT:
60839 case X86ISD::PINSRB:
60840 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60841 case X86ISD::SHUFP: // Handle all target specific shuffles
60842 case X86ISD::INSERTPS:
60843 case X86ISD::EXTRQI:
60844 case X86ISD::INSERTQI:
60845 case X86ISD::VALIGN:
60846 case X86ISD::PALIGNR:
60847 case X86ISD::VSHLDQ:
60848 case X86ISD::VSRLDQ:
60849 case X86ISD::BLENDI:
60850 case X86ISD::UNPCKH:
60851 case X86ISD::UNPCKL:
60852 case X86ISD::MOVHLPS:
60853 case X86ISD::MOVLHPS:
60854 case X86ISD::PSHUFB:
60855 case X86ISD::PSHUFD:
60856 case X86ISD::PSHUFHW:
60857 case X86ISD::PSHUFLW:
60858 case X86ISD::MOVSHDUP:
60859 case X86ISD::MOVSLDUP:
60860 case X86ISD::MOVDDUP:
60861 case X86ISD::MOVSS:
60862 case X86ISD::MOVSD:
60863 case X86ISD::MOVSH:
60864 case X86ISD::VBROADCAST:
60865 case X86ISD::VPPERM:
60866 case X86ISD::VPERMI:
60867 case X86ISD::VPERMV:
60868 case X86ISD::VPERMV3:
60869 case X86ISD::VPERMIL2:
60870 case X86ISD::VPERMILPI:
60871 case X86ISD::VPERMILPV:
60872 case X86ISD::VPERM2X128:
60873 case X86ISD::SHUF128:
60874 case X86ISD::VZEXT_MOVL:
60875 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60876 case X86ISD::FMADD_RND:
60877 case X86ISD::FMSUB:
60878 case X86ISD::STRICT_FMSUB:
60879 case X86ISD::FMSUB_RND:
60880 case X86ISD::FNMADD:
60881 case X86ISD::STRICT_FNMADD:
60882 case X86ISD::FNMADD_RND:
60883 case X86ISD::FNMSUB:
60884 case X86ISD::STRICT_FNMSUB:
60885 case X86ISD::FNMSUB_RND:
60886 case ISD::FMA:
60887 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60888 case X86ISD::FMADDSUB_RND:
60889 case X86ISD::FMSUBADD_RND:
60890 case X86ISD::FMADDSUB:
60891 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60892 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60893 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60894 case X86ISD::MGATHER:
60895 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60896 case ISD::MGATHER:
60897 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60898 case X86ISD::PCMPEQ:
60899 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60900 case X86ISD::PMULDQ:
60901 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60902 case X86ISD::VPMADDUBSW:
60903 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60904 case X86ISD::VPMADD52L:
60905 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60906 case X86ISD::KSHIFTL:
60907 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60908 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60909 case ISD::STRICT_FP_EXTEND:
60910 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60911 case ISD::STRICT_FP_ROUND:
60912 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60913 case X86ISD::VBROADCAST_LOAD:
60914 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60915 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60916 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60917 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60918 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60919 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60920 case ISD::FP_TO_SINT_SAT:
60921 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60922 // clang-format on
60923 }
60924
60925 return SDValue();
60926}
60927
60928 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
60929 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60930}
60931
60932// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60933 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
60934 EVT ExtVT) const {
60935 return Subtarget.hasAVX512() || !VT.isVector();
60936}
60937
60938 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60939 if (!isTypeLegal(VT))
60940 return false;
60941
60942 // There are no vXi8 shifts.
60943 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60944 return false;
60945
60946 // TODO: Almost no 8-bit ops are desirable because they have no actual
60947 // size/speed advantages vs. 32-bit ops, but they do have a major
60948 // potential disadvantage by causing partial register stalls.
60949 //
60950 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60951 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60952 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60953 // check for a constant operand to the multiply.
60954 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60955 return false;
60956
60957 // i16 instruction encodings are longer and some i16 instructions are slow,
60958 // so those are not desirable.
60959 if (VT == MVT::i16) {
60960 switch (Opc) {
60961 default:
60962 break;
60963 case ISD::LOAD:
60964 case ISD::SIGN_EXTEND:
60965 case ISD::ZERO_EXTEND:
60966 case ISD::ANY_EXTEND:
60967 case ISD::MUL:
60968 return false;
60969 case ISD::SHL:
60970 case ISD::SRA:
60971 case ISD::SRL:
60972 case ISD::SUB:
60973 case ISD::ADD:
60974 case ISD::AND:
60975 case ISD::OR:
60976 case ISD::XOR:
60977 // NDD instructions never have the "partial register write" issue because
60978 // the destination register's upper bits [63:OSIZE] are zeroed even when
60979 // OSIZE = 8/16.
60980 return Subtarget.hasNDD();
60981 }
60982 }
60983
60984 // Any legal type not explicitly accounted for above here is desirable.
60985 return true;
60986}
60987
60988 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60989 SDValue Value, SDValue Addr,
60990 int JTI,
60991 SelectionDAG &DAG) const {
60992 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60993 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60994 if (IsCFProtectionSupported) {
60995 // In case control-flow branch protection is enabled, we need to add
60996 // notrack prefix to the indirect branch.
60997 // In order to do that we create NT_BRIND SDNode.
60998 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
60999 SDValue Chain = Value;
61000 // Jump table debug info is only needed if CodeView is enabled.
61001 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
61002 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61003 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61004 }
61005
61006 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61007}
61008
61009 TargetLowering::AndOrSETCCFoldKind
61010 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
61011 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61012 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
61013 EVT VT = LogicOp->getValueType(0);
61014 EVT OpVT = SETCC0->getOperand(0).getValueType();
61015 if (!VT.isInteger())
61016 return AndOrSETCCFoldKind::None;
61017
61018 if (VT.isVector())
61023
61024  // Don't use `NotAnd`: even though `not` is generally shorter code size than
61025  // `add`, `add` can lower to LEA, which can save moves / spills. In any case
61026  // where `NotAnd` applies, `AddAnd` does as well.
61027  // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
61028  // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here.
61030}
61031
61032bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
61033  EVT VT = Op.getValueType();
61034 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61035 isa<ConstantSDNode>(Op.getOperand(1));
61036
61037 // i16 is legal, but undesirable since i16 instruction encodings are longer
61038 // and some i16 instructions are slow.
61039 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61040 // using LEA and/or other ALU ops.
61041 if (VT != MVT::i16 && !Is8BitMulByConstant)
61042 return false;
61043
61044 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61045 if (!Op.hasOneUse())
61046 return false;
61047    SDNode *User = *Op->user_begin();
61048    if (User->getOpcode() != ISD::STORE)
61049      return false;
61050 auto *Ld = cast<LoadSDNode>(Load);
61051 auto *St = cast<StoreSDNode>(User);
61052 return Ld->getBasePtr() == St->getBasePtr();
61053 };
61054
61055 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61056 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61057 return false;
61058 if (!Op.hasOneUse())
61059 return false;
61060 SDNode *User = *Op->user_begin();
61061 if (User->getOpcode() != ISD::ATOMIC_STORE)
61062 return false;
61063 auto *Ld = cast<AtomicSDNode>(Load);
61064 auto *St = cast<AtomicSDNode>(User);
61065 return Ld->getBasePtr() == St->getBasePtr();
61066 };
61067
61068 auto IsFoldableZext = [](SDValue Op) {
61069 if (!Op.hasOneUse())
61070 return false;
61071 SDNode *User = *Op->user_begin();
61072 EVT VT = User->getValueType(0);
61073 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61074 (VT == MVT::i32 || VT == MVT::i64));
61075 };
61076
61077 bool Commute = false;
61078 switch (Op.getOpcode()) {
61079 default: return false;
61080 case ISD::SIGN_EXTEND:
61081 case ISD::ZERO_EXTEND:
61082 case ISD::ANY_EXTEND:
61083 break;
61084 case ISD::SHL:
61085 case ISD::SRA:
61086 case ISD::SRL: {
61087 SDValue N0 = Op.getOperand(0);
61088 // Look out for (store (shl (load), x)).
61089 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61090 return false;
61091 break;
61092 }
61093 case ISD::MUL:
61094 // When ZU is enabled, we prefer to not promote for MUL by a constant
61095 // when there is an opportunity to fold a zext with imulzu.
61096 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61097 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61098 isa<ConstantSDNode>(Op.getOperand(1))))
61099 return false;
61100 [[fallthrough]];
61101 case ISD::ADD:
61102 case ISD::AND:
61103 case ISD::OR:
61104 case ISD::XOR:
61105 Commute = true;
61106 [[fallthrough]];
61107 case ISD::SUB: {
61108 SDValue N0 = Op.getOperand(0);
61109 SDValue N1 = Op.getOperand(1);
61110 // Avoid disabling potential load folding opportunities.
61111 if (X86::mayFoldLoad(N1, Subtarget) &&
61112 (!Commute || !isa<ConstantSDNode>(N0) ||
61113 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61114 return false;
61115 if (X86::mayFoldLoad(N0, Subtarget) &&
61116 ((Commute && !isa<ConstantSDNode>(N1)) ||
61117 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61118 return false;
61119 if (IsFoldableAtomicRMW(N0, Op) ||
61120 (Commute && IsFoldableAtomicRMW(N1, Op)))
61121 return false;
61122 }
61123 }
61124
61125 PVT = MVT::i32;
61126 return true;
61127}
61128
61129//===----------------------------------------------------------------------===//
61130// X86 Inline Assembly Support
61131//===----------------------------------------------------------------------===//
61132
61133static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
61134  X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
61135                           .Case("{@cca}", X86::COND_A)
61136 .Case("{@ccae}", X86::COND_AE)
61137 .Case("{@ccb}", X86::COND_B)
61138 .Case("{@ccbe}", X86::COND_BE)
61139 .Case("{@ccc}", X86::COND_B)
61140 .Case("{@cce}", X86::COND_E)
61141 .Case("{@ccz}", X86::COND_E)
61142 .Case("{@ccg}", X86::COND_G)
61143 .Case("{@ccge}", X86::COND_GE)
61144 .Case("{@ccl}", X86::COND_L)
61145 .Case("{@ccle}", X86::COND_LE)
61146 .Case("{@ccna}", X86::COND_BE)
61147 .Case("{@ccnae}", X86::COND_B)
61148 .Case("{@ccnb}", X86::COND_AE)
61149 .Case("{@ccnbe}", X86::COND_A)
61150 .Case("{@ccnc}", X86::COND_AE)
61151 .Case("{@ccne}", X86::COND_NE)
61152 .Case("{@ccnz}", X86::COND_NE)
61153 .Case("{@ccng}", X86::COND_LE)
61154 .Case("{@ccnge}", X86::COND_L)
61155 .Case("{@ccnl}", X86::COND_GE)
61156 .Case("{@ccnle}", X86::COND_G)
61157 .Case("{@ccno}", X86::COND_NO)
61158 .Case("{@ccnp}", X86::COND_NP)
61159 .Case("{@ccns}", X86::COND_NS)
61160 .Case("{@cco}", X86::COND_O)
61161 .Case("{@ccp}", X86::COND_P)
61162                           .Case("{@ccs}", X86::COND_S)
61163                           .Default(X86::COND_INVALID);
61164  return Cond;
61165}
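// Illustrative sketch (hypothetical user code, not part of this file): the
// "{@cc<cond>}" strings parsed above implement GCC/Clang flag output operand
// constraints; "=@ccz" below yields 1 iff ZF is set after the cmp, with no
// explicit setcc in the asm template.
static inline int ints_equal(int a, int b) {
  int zf;
  asm("cmpl %2, %1" : "=@ccz"(zf) : "r"(a), "r"(b));
  return zf;
}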
61166
61167/// Given a constraint letter, return the type of constraint for this target.
61170 if (Constraint.size() == 1) {
61171 switch (Constraint[0]) {
61172 case 'R':
61173 case 'q':
61174 case 'Q':
61175 case 'f':
61176 case 't':
61177 case 'u':
61178 case 'y':
61179 case 'x':
61180 case 'v':
61181 case 'l':
61182 case 'k': // AVX512 masking registers.
61183 return C_RegisterClass;
61184 case 'a':
61185 case 'b':
61186 case 'c':
61187 case 'd':
61188 case 'S':
61189 case 'D':
61190 case 'A':
61191 return C_Register;
61192 case 'I':
61193 case 'J':
61194 case 'K':
61195 case 'N':
61196 case 'G':
61197 case 'L':
61198 case 'M':
61199 return C_Immediate;
61200 case 'C':
61201 case 'e':
61202 case 'Z':
61203 return C_Other;
61204 default:
61205 break;
61206 }
61207 }
61208 else if (Constraint.size() == 2) {
61209 switch (Constraint[0]) {
61210 default:
61211 break;
61212 case 'W':
61213 if (Constraint[1] != 's')
61214 break;
61215 return C_Other;
61216 case 'Y':
61217 switch (Constraint[1]) {
61218 default:
61219 break;
61220 case 'z':
61221 return C_Register;
61222 case 'i':
61223 case 'm':
61224 case 'k':
61225 case 't':
61226 case '2':
61227 return C_RegisterClass;
61228 }
61229 break;
61230 case 'j':
61231 switch (Constraint[1]) {
61232 default:
61233 break;
61234 case 'r':
61235 case 'R':
61236 return C_RegisterClass;
61237 }
61238 }
61239 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61240 return C_Other;
61241 return TargetLowering::getConstraintType(Constraint);
61242}
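// Illustrative sketch (hypothetical user code, not part of this file): a few
// of the constraint kinds classified above -- 'a' (C_Register: EAX/RAX),
// 'q' (C_RegisterClass: byte-addressable GPR) and 'K' (C_Immediate: signed
// 8-bit constant).
static inline unsigned char low_byte_via_al(unsigned v) {
  unsigned char out;
  asm("movb %b1, %0" : "=q"(out) : "a"(v));
  return out;
}
static inline unsigned add_small_imm(unsigned x) {
  asm("addl %1, %0" : "+r"(x) : "K"(16));
  return x;
}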
61243
61244/// Examine constraint type and operand type and determine a weight value.
61245/// This object must already have been set up with the operand type
61246/// and the current alternative constraint selected.
61247TargetLowering::ConstraintWeight
61248X86TargetLowering::getSingleConstraintMatchWeight(
61249    AsmOperandInfo &Info, const char *Constraint) const {
61250  ConstraintWeight Wt = CW_Invalid;
61251  Value *CallOperandVal = Info.CallOperandVal;
61252 // If we don't have a value, we can't do a match,
61253 // but allow it at the lowest weight.
61254 if (!CallOperandVal)
61255 return CW_Default;
61256 Type *Ty = CallOperandVal->getType();
61257 // Look at the constraint type.
61258 switch (*Constraint) {
61259  default:
61260    Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61261    [[fallthrough]];
61262 case 'R':
61263 case 'q':
61264 case 'Q':
61265 case 'a':
61266 case 'b':
61267 case 'c':
61268 case 'd':
61269 case 'S':
61270 case 'D':
61271 case 'A':
61272 if (CallOperandVal->getType()->isIntegerTy())
61273 Wt = CW_SpecificReg;
61274 break;
61275 case 'f':
61276 case 't':
61277 case 'u':
61278 if (Ty->isFloatingPointTy())
61279 Wt = CW_SpecificReg;
61280 break;
61281 case 'y':
61282 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61283 Wt = CW_SpecificReg;
61284 break;
61285 case 'Y':
61286 if (StringRef(Constraint).size() != 2)
61287 break;
61288 switch (Constraint[1]) {
61289 default:
61290 return CW_Invalid;
61291 // XMM0
61292 case 'z':
61293 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61294 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61295 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61296 return CW_SpecificReg;
61297 return CW_Invalid;
61298 // Conditional OpMask regs (AVX512)
61299 case 'k':
61300 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61301 return CW_Register;
61302 return CW_Invalid;
61303 // Any MMX reg
61304 case 'm':
61305 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61306 return CW_SpecificReg;
61307 return CW_Invalid;
61308 // Any SSE reg when ISA >= SSE2, same as 'x'
61309 case 'i':
61310 case 't':
61311 case '2':
61312 if (!Subtarget.hasSSE2())
61313 return CW_Invalid;
61314 break;
61315 }
61316 break;
61317 case 'j':
61318 if (StringRef(Constraint).size() != 2)
61319 break;
61320 switch (Constraint[1]) {
61321 default:
61322 return CW_Invalid;
61323 case 'r':
61324 case 'R':
61325 if (CallOperandVal->getType()->isIntegerTy())
61326 Wt = CW_SpecificReg;
61327 break;
61328 }
61329 break;
61330 case 'v':
61331 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61332 Wt = CW_Register;
61333 [[fallthrough]];
61334 case 'x':
61335 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61336 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61337 Wt = CW_Register;
61338 break;
61339 case 'k':
61340 // Enable conditional vector operations using %k<#> registers.
61341 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61342 Wt = CW_Register;
61343 break;
61344 case 'I':
61345 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61346 if (C->getZExtValue() <= 31)
61347 Wt = CW_Constant;
61348 break;
61349 case 'J':
61350 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61351 if (C->getZExtValue() <= 63)
61352 Wt = CW_Constant;
61353 break;
61354 case 'K':
61355 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61356 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61357 Wt = CW_Constant;
61358 break;
61359 case 'L':
61360 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61361 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61362 Wt = CW_Constant;
61363 break;
61364 case 'M':
61365 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61366 if (C->getZExtValue() <= 3)
61367 Wt = CW_Constant;
61368 break;
61369 case 'N':
61370 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61371 if (C->getZExtValue() <= 0xff)
61372 Wt = CW_Constant;
61373 break;
61374 case 'G':
61375 case 'C':
61376 if (isa<ConstantFP>(CallOperandVal))
61377 Wt = CW_Constant;
61378 break;
61379 case 'e':
61380 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61381 if ((C->getSExtValue() >= -0x80000000LL) &&
61382 (C->getSExtValue() <= 0x7fffffffLL))
61383 Wt = CW_Constant;
61384 break;
61385 case 'Z':
61386 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61387 if (C->getZExtValue() <= 0xffffffff)
61388 Wt = CW_Constant;
61389 break;
61390 }
61391 return Wt;
61392}
61393
61394/// Try to replace an X constraint, which matches anything, with another that
61395/// has more specific requirements based on the type of the corresponding
61396/// operand.
61398LowerXConstraint(EVT ConstraintVT) const {
61399 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61400 // 'f' like normal targets.
61401 if (ConstraintVT.isFloatingPoint()) {
61402 if (Subtarget.hasSSE1())
61403 return "x";
61404 }
61405
61406 return TargetLowering::LowerXConstraint(ConstraintVT);
61407}
61408
61409// Lower @cc targets via setcc.
61411 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61412 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61413 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61414 if (Cond == X86::COND_INVALID)
61415 return SDValue();
61416 // Check that return type is valid.
61417 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61418 OpInfo.ConstraintVT.getSizeInBits() < 8)
61419 report_fatal_error("Glue output operand is of invalid type");
61420
61421 // Get EFLAGS register. Only update chain when copyfrom is glued.
61422 if (Glue.getNode()) {
61423 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61424 Chain = Glue.getValue(1);
61425 } else
61426 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61427 // Extract CC code.
61428 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61429 // Extend to 32-bits
61430 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61431
61432 return Result;
61433}
61434
61435/// Lower the specified operand into the Ops vector.
61436/// If it is invalid, don't add anything to Ops.
61438 StringRef Constraint,
61439 std::vector<SDValue> &Ops,
61440 SelectionDAG &DAG) const {
61441 SDValue Result;
61442 char ConstraintLetter = Constraint[0];
61443 switch (ConstraintLetter) {
61444 default: break;
61445 case 'I':
61446 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61447 if (C->getZExtValue() <= 31) {
61448 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61449 Op.getValueType());
61450 break;
61451 }
61452 }
61453 return;
61454 case 'J':
61455 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61456 if (C->getZExtValue() <= 63) {
61457 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61458 Op.getValueType());
61459 break;
61460 }
61461 }
61462 return;
61463 case 'K':
61464 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61465 if (isInt<8>(C->getSExtValue())) {
61466 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61467 Op.getValueType());
61468 break;
61469 }
61470 }
61471 return;
61472 case 'L':
61473 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61474 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61475 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61476 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61477 Op.getValueType());
61478 break;
61479 }
61480 }
61481 return;
61482 case 'M':
61483 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61484 if (C->getZExtValue() <= 3) {
61485 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61486 Op.getValueType());
61487 break;
61488 }
61489 }
61490 return;
61491 case 'N':
61492 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61493 if (C->getZExtValue() <= 255) {
61494 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61495 Op.getValueType());
61496 break;
61497 }
61498 }
61499 return;
61500 case 'O':
61501 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61502 if (C->getZExtValue() <= 127) {
61503 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61504 Op.getValueType());
61505 break;
61506 }
61507 }
61508 return;
61509 case 'e': {
61510 // 32-bit signed value
61511 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61513 C->getSExtValue())) {
61514 // Widen to 64 bits here to get it sign extended.
61515 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61516 break;
61517 }
61518 // FIXME gcc accepts some relocatable values here too, but only in certain
61519 // memory models; it's complicated.
61520 }
61521 return;
61522 }
61523 case 'W': {
61524 assert(Constraint[1] == 's');
61525 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61526 // offset.
61527 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61528 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61529 BA->getValueType(0)));
61530 } else {
61531 int64_t Offset = 0;
61532 if (Op->getOpcode() == ISD::ADD &&
61533 isa<ConstantSDNode>(Op->getOperand(1))) {
61534 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61535 Op = Op->getOperand(0);
61536 }
61537 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61538 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61539 GA->getValueType(0), Offset));
61540 }
61541 return;
61542 }
61543 case 'Z': {
61544 // 32-bit unsigned value
61545 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61547 C->getZExtValue())) {
61548 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61549 Op.getValueType());
61550 break;
61551 }
61552 }
61553 // FIXME gcc accepts some relocatable values here too, but only in certain
61554 // memory models; it's complicated.
61555 return;
61556 }
61557 case 'i': {
61558 // Literal immediates are always ok.
61559 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61560 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61561 BooleanContent BCont = getBooleanContents(MVT::i64);
61562 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61564 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61565 : CST->getSExtValue();
61566 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61567 break;
61568 }
61569
61570 // In any sort of PIC mode addresses need to be computed at runtime by
61571 // adding in a register or some sort of table lookup. These can't
61572 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61573 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61575 return;
61576
61577 // If we are in non-pic codegen mode, we allow the address of a global (with
61578 // an optional displacement) to be used with 'i'.
61579 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61580 // If we require an extra load to get this address, as in PIC mode, we
61581 // can't accept it.
61583 Subtarget.classifyGlobalReference(GA->getGlobal())))
61584 return;
61585 break;
61586 }
61587 }
61588
61589 if (Result.getNode()) {
61590 Ops.push_back(Result);
61591 return;
61592 }
61593 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61594}
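// Illustrative sketch (hypothetical user code, not part of this file): the
// 'I' constraint handled above only accepts immediates in [0, 31], matching
// valid 32-bit shift/rotate counts.
static inline unsigned rotl_by_5(unsigned x) {
  asm("roll %1, %0" : "+r"(x) : "I"(5));
  return x;
}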
61595
61596/// Check if \p RC is a general purpose register class.
61597/// I.e., GR* or one of their variant.
61598static bool isGRClass(const TargetRegisterClass &RC) {
61599 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61600 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61601 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61602 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61603 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61604}
61605
61606/// Check if \p RC is a vector register class.
61607/// I.e., FR* / VR* or one of their variant.
61608static bool isFRClass(const TargetRegisterClass &RC) {
61609 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61610 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61611 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61612 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61613 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61614 RC.hasSuperClassEq(&X86::VR512RegClass);
61615}
61616
61617/// Check if \p RC is a mask register class.
61618/// I.e., VK* or one of their variant.
61619static bool isVKClass(const TargetRegisterClass &RC) {
61620 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61621 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61622 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61623 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61624 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61625 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61626 RC.hasSuperClassEq(&X86::VK64RegClass);
61627}
61628
61629static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61630 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61631}
61632
61633std::pair<unsigned, const TargetRegisterClass *>
61635 StringRef Constraint,
61636 MVT VT) const {
61637 // First, see if this is a constraint that directly corresponds to an LLVM
61638 // register class.
61639 if (Constraint.size() == 1) {
61640 // GCC Constraint Letters
61641 switch (Constraint[0]) {
61642 default: break;
61643 // 'A' means [ER]AX + [ER]DX.
61644 case 'A':
61645 if (Subtarget.is64Bit())
61646 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61647 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61648 "Expecting 64, 32 or 16 bit subtarget");
61649 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61650
61651 // TODO: Slight differences here in allocation order and leaving
61652 // RIP in the class. Do they matter any more here than they do
61653 // in the normal allocation?
61654 case 'k':
61655 if (Subtarget.hasAVX512()) {
61656 if (VT == MVT::v1i1 || VT == MVT::i1)
61657 return std::make_pair(0U, &X86::VK1RegClass);
61658 if (VT == MVT::v8i1 || VT == MVT::i8)
61659 return std::make_pair(0U, &X86::VK8RegClass);
61660 if (VT == MVT::v16i1 || VT == MVT::i16)
61661 return std::make_pair(0U, &X86::VK16RegClass);
61662 }
61663 if (Subtarget.hasBWI()) {
61664 if (VT == MVT::v32i1 || VT == MVT::i32)
61665 return std::make_pair(0U, &X86::VK32RegClass);
61666 if (VT == MVT::v64i1 || VT == MVT::i64)
61667 return std::make_pair(0U, &X86::VK64RegClass);
61668 }
61669 break;
61670 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61671 if (Subtarget.is64Bit()) {
61672 if (VT == MVT::i8 || VT == MVT::i1)
61673 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61674 ? &X86::GR8RegClass
61675 : &X86::GR8_NOREX2RegClass);
61676 if (VT == MVT::i16)
61677 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61678 ? &X86::GR16RegClass
61679 : &X86::GR16_NOREX2RegClass);
61680 if (VT == MVT::i32 || VT == MVT::f32)
61681 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61682 ? &X86::GR32RegClass
61683 : &X86::GR32_NOREX2RegClass);
61684 if (VT != MVT::f80 && !VT.isVector())
61685 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61686 ? &X86::GR64RegClass
61687 : &X86::GR64_NOREX2RegClass);
61688 break;
61689 }
61690 [[fallthrough]];
61691 // 32-bit fallthrough
61692 case 'Q': // Q_REGS
61693 if (VT == MVT::i8 || VT == MVT::i1)
61694 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61695 if (VT == MVT::i16)
61696 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61697 if (VT == MVT::i32 || VT == MVT::f32 ||
61698 (!VT.isVector() && !Subtarget.is64Bit()))
61699 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61700 if (VT != MVT::f80 && !VT.isVector())
61701 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61702 break;
61703 case 'r': // GENERAL_REGS
61704 case 'l': // INDEX_REGS
61705 if (VT == MVT::i8 || VT == MVT::i1)
61706 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61707 ? &X86::GR8RegClass
61708 : &X86::GR8_NOREX2RegClass);
61709 if (VT == MVT::i16)
61710 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61711 ? &X86::GR16RegClass
61712 : &X86::GR16_NOREX2RegClass);
61713 if (VT == MVT::i32 || VT == MVT::f32 ||
61714 (!VT.isVector() && !Subtarget.is64Bit()))
61715 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61716 ? &X86::GR32RegClass
61717 : &X86::GR32_NOREX2RegClass);
61718 if (VT != MVT::f80 && !VT.isVector())
61719 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61720 ? &X86::GR64RegClass
61721 : &X86::GR64_NOREX2RegClass);
61722 break;
61723 case 'R': // LEGACY_REGS
61724 if (VT == MVT::i8 || VT == MVT::i1)
61725 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61726 if (VT == MVT::i16)
61727 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61728 if (VT == MVT::i32 || VT == MVT::f32 ||
61729 (!VT.isVector() && !Subtarget.is64Bit()))
61730 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61731 if (VT != MVT::f80 && !VT.isVector())
61732 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61733 break;
61734 case 'f': // FP Stack registers.
61735 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61736 // value to the correct fpstack register class.
61737 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61738 return std::make_pair(0U, &X86::RFP32RegClass);
61739 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61740 return std::make_pair(0U, &X86::RFP64RegClass);
61741 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61742 return std::make_pair(0U, &X86::RFP80RegClass);
61743 break;
61744 case 'y': // MMX_REGS if MMX allowed.
61745 if (!Subtarget.hasMMX()) break;
61746 return std::make_pair(0U, &X86::VR64RegClass);
61747 case 'v':
61748 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61749 if (!Subtarget.hasSSE1()) break;
61750 bool VConstraint = (Constraint[0] == 'v');
61751
61752 switch (VT.SimpleTy) {
61753 default: break;
61754 // Scalar SSE types.
61755 case MVT::f16:
61756 if (VConstraint && Subtarget.hasFP16())
61757 return std::make_pair(0U, &X86::FR16XRegClass);
61758 break;
61759 case MVT::f32:
61760 case MVT::i32:
61761 if (VConstraint && Subtarget.hasVLX())
61762 return std::make_pair(0U, &X86::FR32XRegClass);
61763 return std::make_pair(0U, &X86::FR32RegClass);
61764 case MVT::f64:
61765 case MVT::i64:
61766 if (VConstraint && Subtarget.hasVLX())
61767 return std::make_pair(0U, &X86::FR64XRegClass);
61768 return std::make_pair(0U, &X86::FR64RegClass);
61769 case MVT::i128:
61770 if (Subtarget.is64Bit()) {
61771 if (VConstraint && Subtarget.hasVLX())
61772 return std::make_pair(0U, &X86::VR128XRegClass);
61773 return std::make_pair(0U, &X86::VR128RegClass);
61774 }
61775 break;
61776 // Vector types and fp128.
61777 case MVT::v8f16:
61778 if (!Subtarget.hasFP16())
61779 break;
61780 if (VConstraint)
61781 return std::make_pair(0U, &X86::VR128XRegClass);
61782 return std::make_pair(0U, &X86::VR128RegClass);
61783 case MVT::v8bf16:
61784 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61785 break;
61786 if (VConstraint)
61787 return std::make_pair(0U, &X86::VR128XRegClass);
61788 return std::make_pair(0U, &X86::VR128RegClass);
61789 case MVT::f128:
61790 if (!Subtarget.is64Bit())
61791 break;
61792 [[fallthrough]];
61793 case MVT::v16i8:
61794 case MVT::v8i16:
61795 case MVT::v4i32:
61796 case MVT::v2i64:
61797 case MVT::v4f32:
61798 case MVT::v2f64:
61799 if (VConstraint && Subtarget.hasVLX())
61800 return std::make_pair(0U, &X86::VR128XRegClass);
61801 return std::make_pair(0U, &X86::VR128RegClass);
61802 // AVX types.
61803 case MVT::v16f16:
61804 if (!Subtarget.hasFP16())
61805 break;
61806 if (VConstraint)
61807 return std::make_pair(0U, &X86::VR256XRegClass);
61808 return std::make_pair(0U, &X86::VR256RegClass);
61809 case MVT::v16bf16:
61810 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61811 break;
61812 if (VConstraint)
61813 return std::make_pair(0U, &X86::VR256XRegClass);
61814 return std::make_pair(0U, &X86::VR256RegClass);
61815 case MVT::v32i8:
61816 case MVT::v16i16:
61817 case MVT::v8i32:
61818 case MVT::v4i64:
61819 case MVT::v8f32:
61820 case MVT::v4f64:
61821 if (VConstraint && Subtarget.hasVLX())
61822 return std::make_pair(0U, &X86::VR256XRegClass);
61823 if (Subtarget.hasAVX())
61824 return std::make_pair(0U, &X86::VR256RegClass);
61825 break;
61826 case MVT::v32f16:
61827 if (!Subtarget.hasFP16())
61828 break;
61829 if (VConstraint)
61830 return std::make_pair(0U, &X86::VR512RegClass);
61831 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61832 case MVT::v32bf16:
61833 if (!Subtarget.hasBF16())
61834 break;
61835 if (VConstraint)
61836 return std::make_pair(0U, &X86::VR512RegClass);
61837 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61838 case MVT::v64i8:
61839 case MVT::v32i16:
61840 case MVT::v8f64:
61841 case MVT::v16f32:
61842 case MVT::v16i32:
61843 case MVT::v8i64:
61844 if (!Subtarget.hasAVX512()) break;
61845 if (VConstraint)
61846 return std::make_pair(0U, &X86::VR512RegClass);
61847 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61848 }
61849 break;
61850 }
61851 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61852 switch (Constraint[1]) {
61853 default:
61854 break;
61855 case 'i':
61856 case 't':
61857 case '2':
61858 return getRegForInlineAsmConstraint(TRI, "x", VT);
61859 case 'm':
61860 if (!Subtarget.hasMMX()) break;
61861 return std::make_pair(0U, &X86::VR64RegClass);
61862 case 'z':
61863 if (!Subtarget.hasSSE1()) break;
61864 switch (VT.SimpleTy) {
61865 default: break;
61866 // Scalar SSE types.
61867 case MVT::f16:
61868 if (!Subtarget.hasFP16())
61869 break;
61870 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61871 case MVT::f32:
61872 case MVT::i32:
61873 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61874 case MVT::f64:
61875 case MVT::i64:
61876 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61877 case MVT::v8f16:
61878 if (!Subtarget.hasFP16())
61879 break;
61880 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61881 case MVT::v8bf16:
61882 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61883 break;
61884 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61885 case MVT::f128:
61886 case MVT::v16i8:
61887 case MVT::v8i16:
61888 case MVT::v4i32:
61889 case MVT::v2i64:
61890 case MVT::v4f32:
61891 case MVT::v2f64:
61892 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61893 // AVX types.
61894 case MVT::v16f16:
61895 if (!Subtarget.hasFP16())
61896 break;
61897 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61898 case MVT::v16bf16:
61899 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61900 break;
61901 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61902 case MVT::v32i8:
61903 case MVT::v16i16:
61904 case MVT::v8i32:
61905 case MVT::v4i64:
61906 case MVT::v8f32:
61907 case MVT::v4f64:
61908 if (Subtarget.hasAVX())
61909 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61910 break;
61911 case MVT::v32f16:
61912 if (!Subtarget.hasFP16())
61913 break;
61914 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61915 case MVT::v32bf16:
61916 if (!Subtarget.hasBF16())
61917 break;
61918 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61919 case MVT::v64i8:
61920 case MVT::v32i16:
61921 case MVT::v8f64:
61922 case MVT::v16f32:
61923 case MVT::v16i32:
61924 case MVT::v8i64:
61925 if (Subtarget.hasAVX512())
61926 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61927 break;
61928 }
61929 break;
61930 case 'k':
61931      // This register class doesn't allocate k0 for masked vector operations.
61932 if (Subtarget.hasAVX512()) {
61933 if (VT == MVT::v1i1 || VT == MVT::i1)
61934 return std::make_pair(0U, &X86::VK1WMRegClass);
61935 if (VT == MVT::v8i1 || VT == MVT::i8)
61936 return std::make_pair(0U, &X86::VK8WMRegClass);
61937 if (VT == MVT::v16i1 || VT == MVT::i16)
61938 return std::make_pair(0U, &X86::VK16WMRegClass);
61939 }
61940 if (Subtarget.hasBWI()) {
61941 if (VT == MVT::v32i1 || VT == MVT::i32)
61942 return std::make_pair(0U, &X86::VK32WMRegClass);
61943 if (VT == MVT::v64i1 || VT == MVT::i64)
61944 return std::make_pair(0U, &X86::VK64WMRegClass);
61945 }
61946 break;
61947 }
61948 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61949 switch (Constraint[1]) {
61950 default:
61951 break;
61952 case 'r':
61953 if (VT == MVT::i8 || VT == MVT::i1)
61954 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61955 if (VT == MVT::i16)
61956 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61957 if (VT == MVT::i32 || VT == MVT::f32)
61958 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61959 if (VT != MVT::f80 && !VT.isVector())
61960 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61961 break;
61962 case 'R':
61963 if (VT == MVT::i8 || VT == MVT::i1)
61964 return std::make_pair(0U, &X86::GR8RegClass);
61965 if (VT == MVT::i16)
61966 return std::make_pair(0U, &X86::GR16RegClass);
61967 if (VT == MVT::i32 || VT == MVT::f32)
61968 return std::make_pair(0U, &X86::GR32RegClass);
61969 if (VT != MVT::f80 && !VT.isVector())
61970 return std::make_pair(0U, &X86::GR64RegClass);
61971 break;
61972 }
61973 }
61974
61975 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61976 return std::make_pair(0U, &X86::GR32RegClass);
61977
61978 // Use the default implementation in TargetLowering to convert the register
61979 // constraint into a member of a register class.
61980  std::pair<Register, const TargetRegisterClass*> Res;
61981  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
61982
61983 // Not found as a standard register?
61984 if (!Res.second) {
61985 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61986 // to/from f80.
61987 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61988 // Map st(0) -> st(7) -> ST0
61989 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61990 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61991 Constraint[3] == '(' &&
61992 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61993 Constraint[5] == ')' && Constraint[6] == '}') {
61994 // st(7) is not allocatable and thus not a member of RFP80. Return
61995 // singleton class in cases where we have a reference to it.
61996 if (Constraint[4] == '7')
61997 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
61998 return std::make_pair(X86::FP0 + Constraint[4] - '0',
61999 &X86::RFP80RegClass);
62000 }
62001
62002 // GCC allows "st(0)" to be called just plain "st".
62003 if (StringRef("{st}").equals_insensitive(Constraint))
62004 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62005 }
62006
62007 // flags -> EFLAGS
62008 if (StringRef("{flags}").equals_insensitive(Constraint))
62009 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62010
62011 // dirflag -> DF
62012 // Only allow for clobber.
62013 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62014 VT == MVT::Other)
62015 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62016
62017 // fpsr -> FPSW
62018 // Only allow for clobber.
62019 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62020 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62021
62022 return Res;
62023 }
62024
62025 // Make sure it isn't a register that requires 64-bit mode.
62026 if (!Subtarget.is64Bit() &&
62027 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62028 TRI->getEncodingValue(Res.first) >= 8) {
62029 // Register requires REX prefix, but we're in 32-bit mode.
62030 return std::make_pair(0, nullptr);
62031 }
62032
62033 // Make sure it isn't a register that requires AVX512.
62034 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62035 TRI->getEncodingValue(Res.first) & 0x10) {
62036 // Register requires EVEX prefix.
62037 return std::make_pair(0, nullptr);
62038 }
62039
62040 // Otherwise, check to see if this is a register class of the wrong value
62041 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62042 // turn into {ax},{dx}.
62043 // MVT::Other is used to specify clobber names.
62044 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62045 return Res; // Correct type already, nothing to do.
62046
62047  // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
62048  // return "eax". This should even work for things like getting 64-bit integer
62049  // registers when given an f64 type.
62050 const TargetRegisterClass *Class = Res.second;
62051 // The generic code will match the first register class that contains the
62052 // given register. Thus, based on the ordering of the tablegened file,
62053 // the "plain" GR classes might not come first.
62054 // Therefore, use a helper method.
62055 if (isGRClass(*Class)) {
62056 unsigned Size = VT.getSizeInBits();
62057 if (Size == 1) Size = 8;
62058 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62059 return std::make_pair(0, nullptr);
62060 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62061 if (DestReg.isValid()) {
62062 bool is64Bit = Subtarget.is64Bit();
62063 const TargetRegisterClass *RC =
62064 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62065 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62066 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62067 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62068 if (Size == 64 && !is64Bit) {
62069 // Model GCC's behavior here and select a fixed pair of 32-bit
62070 // registers.
62071 switch (DestReg) {
62072 case X86::RAX:
62073 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62074 case X86::RDX:
62075 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62076 case X86::RCX:
62077 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62078 case X86::RBX:
62079 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62080 case X86::RSI:
62081 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62082 case X86::RDI:
62083 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62084 case X86::RBP:
62085 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62086 default:
62087 return std::make_pair(0, nullptr);
62088 }
62089 }
62090 if (RC && RC->contains(DestReg))
62091 return std::make_pair(DestReg, RC);
62092 return Res;
62093 }
62094 // No register found/type mismatch.
62095 return std::make_pair(0, nullptr);
62096 } else if (isFRClass(*Class)) {
62097 // Handle references to XMM physical registers that got mapped into the
62098 // wrong class. This can happen with constraints like {xmm0} where the
62099 // target independent register mapper will just pick the first match it can
62100 // find, ignoring the required type.
62101
62102 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62103 if (VT == MVT::f16)
62104 Res.second = &X86::FR16XRegClass;
62105 else if (VT == MVT::f32 || VT == MVT::i32)
62106 Res.second = &X86::FR32XRegClass;
62107 else if (VT == MVT::f64 || VT == MVT::i64)
62108 Res.second = &X86::FR64XRegClass;
62109 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62110 Res.second = &X86::VR128XRegClass;
62111 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62112 Res.second = &X86::VR256XRegClass;
62113 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62114 Res.second = &X86::VR512RegClass;
62115 else {
62116 // Type mismatch and not a clobber: Return an error;
62117 Res.first = 0;
62118 Res.second = nullptr;
62119 }
62120 } else if (isVKClass(*Class)) {
62121 if (VT == MVT::v1i1 || VT == MVT::i1)
62122 Res.second = &X86::VK1RegClass;
62123 else if (VT == MVT::v8i1 || VT == MVT::i8)
62124 Res.second = &X86::VK8RegClass;
62125 else if (VT == MVT::v16i1 || VT == MVT::i16)
62126 Res.second = &X86::VK16RegClass;
62127 else if (VT == MVT::v32i1 || VT == MVT::i32)
62128 Res.second = &X86::VK32RegClass;
62129 else if (VT == MVT::v64i1 || VT == MVT::i64)
62130 Res.second = &X86::VK64RegClass;
62131 else {
62132 // Type mismatch and not a clobber: Return an error;
62133 Res.first = 0;
62134 Res.second = nullptr;
62135 }
62136 }
62137
62138 return Res;
62139}
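// Illustrative sketch (hypothetical user code, not part of this file): the
// 'A' constraint resolved above names the EDX:EAX pair, which is what the
// classic 32-bit rdtsc idiom relies on (on x86-64, 'A' may pick either RAX or
// RDX, so 64-bit code usually uses "=a"/"=d" instead).
static inline unsigned long long rdtsc_ia32(void) {
  unsigned long long tsc;
  asm volatile("rdtsc" : "=A"(tsc));
  return tsc;
}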
62140
62141bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62142 // Integer division on x86 is expensive. However, when aggressively optimizing
62143 // for code size, we prefer to use a div instruction, as it is usually smaller
62144 // than the alternative sequence.
62145 // The exception to this is vector division. Since x86 doesn't have vector
62146 // integer division, leaving the division as-is is a loss even in terms of
62147 // size, because it will have to be scalarized, while the alternative code
62148 // sequence can be performed in vector form.
62149 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62150 return OptSize && !VT.isVector();
62151}
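// Illustrative sketch (hypothetical user code, not part of this file): with
// the function marked minsize (e.g. built at -Oz), a scalar division like
// this is usually kept as a single div, whereas at -O2 it is expanded into a
// multiply/shift sequence; vector divisions keep the expansion even at
// minsize, per isIntDivCheap above.
static inline unsigned div_by_10(unsigned x) {
  return x / 10;
}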
62152
62153void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62154 if (!Subtarget.is64Bit())
62155 return;
62156
62157  // Update IsSplitCSR in X86MachineFunctionInfo.
62158  X86MachineFunctionInfo *AFI =
62159      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62160 AFI->setIsSplitCSR(true);
62161}
62162
62163void X86TargetLowering::insertCopiesSplitCSR(
62164 MachineBasicBlock *Entry,
62165 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62166 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62167 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62168 if (!IStart)
62169 return;
62170
62171 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62172 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62173 MachineBasicBlock::iterator MBBI = Entry->begin();
62174 for (const MCPhysReg *I = IStart; *I; ++I) {
62175 const TargetRegisterClass *RC = nullptr;
62176 if (X86::GR64RegClass.contains(*I))
62177 RC = &X86::GR64RegClass;
62178 else
62179 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62180
62181 Register NewVR = MRI->createVirtualRegister(RC);
62182 // Create copy from CSR to a virtual register.
62183 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62184 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62185 // nounwind. If we want to generalize this later, we may need to emit
62186 // CFI pseudo-instructions.
62187 assert(
62188 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62189 "Function should be nounwind in insertCopiesSplitCSR!");
62190 Entry->addLiveIn(*I);
62191 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62192 .addReg(*I);
62193
62194 // Insert the copy-back instructions right before the terminator.
62195 for (auto *Exit : Exits)
62196 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62197 TII->get(TargetOpcode::COPY), *I)
62198 .addReg(NewVR);
62199 }
62200}
62201
62202bool X86TargetLowering::supportSwiftError() const {
62203  return Subtarget.is64Bit();
62204}
62205
62206MachineInstr *
62207X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
62208                                 MachineBasicBlock::iterator &MBBI,
62209                                 const TargetInstrInfo *TII) const {
62210 assert(MBBI->isCall() && MBBI->getCFIType() &&
62211 "Invalid call instruction for a KCFI check");
62212
62213 MachineFunction &MF = *MBB.getParent();
62214 // If the call target is a memory operand, unfold it and use R11 for the
62215 // call, so KCFI_CHECK won't have to recompute the address.
62216 switch (MBBI->getOpcode()) {
62217 case X86::CALL64m:
62218 case X86::CALL64m_NT:
62219 case X86::TAILJMPm64:
62220 case X86::TAILJMPm64_REX: {
62223 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62224 /*UnfoldStore=*/false, NewMIs))
62225 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62226 for (auto *NewMI : NewMIs)
62227 MBBI = MBB.insert(OrigCall, NewMI);
62228 assert(MBBI->isCall() &&
62229 "Unexpected instruction after memory operand unfolding");
62230 if (OrigCall->shouldUpdateAdditionalCallInfo())
62231 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62232 MBBI->setCFIType(MF, OrigCall->getCFIType());
62233 OrigCall->eraseFromParent();
62234 break;
62235 }
62236 default:
62237 break;
62238 }
62239
62240 MachineOperand &Target = MBBI->getOperand(0);
62241 Register TargetReg;
62242 switch (MBBI->getOpcode()) {
62243 case X86::CALL64r:
62244 case X86::CALL64r_ImpCall:
62245 case X86::CALL64r_NT:
62246 case X86::TAILJMPr64:
62247 case X86::TAILJMPr64_REX:
62248 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62249 Target.setIsRenamable(false);
62250 TargetReg = Target.getReg();
62251 break;
62252 case X86::CALL64pcrel32:
62253 case X86::TAILJMPd64:
62254 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62255 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62256 // 64-bit indirect thunk calls.
62257 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62258 "Unexpected register for an indirect thunk call");
62259 TargetReg = X86::R11;
62260 break;
62261 default:
62262 llvm_unreachable("Unexpected CFI call opcode");
62263 break;
62264 }
62265
62266 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62267 .addReg(TargetReg)
62268 .addImm(MBBI->getCFIType())
62269 .getInstr();
62270}
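// Illustrative sketch (hypothetical user code, not part of this file): with
// -fsanitize=kcfi, an indirect call such as this is preceded by a KCFI_CHECK
// of the callee's type hash against the target register chosen above (R11
// when a memory-operand call had to be unfolded).
static inline int call_indirect(int (*fp)(int), int x) {
  return fp(x);
}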
62271
62272/// Returns true if stack probing through a function call is requested.
62273bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
62274  return !getStackProbeSymbolName(MF).empty();
62275}
62276
62277/// Returns true if stack probing through inline assembly is requested.
62278bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
62279
62280  // No inline stack probe for Windows; it has its own mechanism.
62281 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62282 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62283 return false;
62284
62285 // If the function specifically requests inline stack probes, emit them.
62286 if (MF.getFunction().hasFnAttribute("probe-stack"))
62287 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62288 "inline-asm";
62289
62290 return false;
62291}
62292
62293/// Returns the name of the symbol used to emit stack probes or the empty
62294/// string if not applicable.
62295StringRef
62296X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
62297  // Inline stack probes disable the stack probe call.
62298 if (hasInlineStackProbe(MF))
62299 return "";
62300
62301 // If the function specifically requests stack probes, emit them.
62302 if (MF.getFunction().hasFnAttribute("probe-stack"))
62303 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62304
62305 // Generally, if we aren't on Windows, the platform ABI does not include
62306 // support for stack probes, so don't emit them.
62307 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62308 Subtarget.isTargetMachO() ||
62309 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62310 return "";
62311
62312 // We need a stack probe to conform to the Windows ABI. Choose the right
62313 // symbol.
62314 if (Subtarget.is64Bit())
62315 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62316 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62317}
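// Illustrative sketch (hypothetical user code, not part of this file): on
// Windows targets, a frame larger than the probe size (4096 bytes by default,
// see getStackProbeSize below) gets a call to the symbol chosen above (e.g.
// __chkstk in 64-bit MSVC environments) in its prologue.
static int touch_big_frame(void) {
  volatile char buf[8192];
  buf[0] = 1;
  buf[sizeof(buf) - 1] = 2;
  return buf[0] + buf[sizeof(buf) - 1];
}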
62318
62319unsigned
62320X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
62321  // The default stack probe size is 4096 if the function has no
62322  // "stack-probe-size" attribute.
62323 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62324 4096);
62325}
62326
62327Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
62328  if (ML && ML->isInnermost() &&
62329      ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62330    return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62331  return TargetLowering::getPrefLoopAlignment(ML);
62332}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.

static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
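A scalar model of why sign-extending in-register followed by PACKSS is an exact truncation: PACKSS narrows each signed element with signed saturation, and saturation never fires on a value whose upper bits already replicate the sign bit. The helper below is a hypothetical illustration, not the DAG code.
#include <cassert>
#include <cstdint>
// Scalar model of one PACKSSWB lane: narrow i16 -> i8 with signed saturation.
static int8_t packssElt(int16_t V) {
  int W = V;
  if (W > 127) W = 127;
  if (W < -128) W = -128;
  return static_cast<int8_t>(W);
}
int main() {
  for (int V = -128; V <= 127; ++V) {
    // "Sign-extended in reg": the i16 holds an i8 value with its sign bit
    // replicated, so saturation is a no-op and PACKSS equals truncation.
    int16_t InReg = static_cast<int16_t>(static_cast<int8_t>(V));
    assert(packssElt(InReg) == static_cast<int8_t>(V));
  }
  return 0;
}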
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
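The scalar identity behind this fold can be checked directly: BLSMSK(x) is x ^ (x - 1) and ANDN(y, m) is y & ~m, so y & (x ^ -x) equals ANDN(y, BLSMSK(x)). A small self-contained check (not LLVM code):
#include <cassert>
#include <cstdint>
static uint32_t blsmsk(uint32_t X) { return X ^ (X - 1); } // bits up to and including the lowest set bit
static uint32_t andn(uint32_t Y, uint32_t M) { return Y & ~M; }
int main() {
  const uint32_t Xs[] = {0u, 1u, 4u, 0x80u, 0xDEADBEEFu, 0xFFFFFFFFu};
  const uint32_t Ys[] = {0u, 0xFFFFFFFFu, 0x12345678u};
  for (uint32_t X : Xs)
    for (uint32_t Y : Ys)
      assert((Y & (X ^ (0u - X))) == andn(Y, blsmsk(X)));
  return 0;
}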
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of chained builtin intrinsics that return their value in the register pair EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
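As an illustration of the kind of mask such a lowering looks for, here is a simplified, hypothetical checker (not the real matcher): a zero/any-extension shuffle keeps every Scale-th element from the source and leaves the lanes in between zeroable or undef.
#include <cassert>
#include <vector>
// Simplified check: with -1 marking a lane that may be zero/undef, a mask is
// a Scale-times zero/any-extension if element Scale*j reads source element j
// and every lane in between is -1.
static bool isZExtMask(const std::vector<int> &Mask, int Scale) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I) {
    if (I % Scale == 0) {
      if (Mask[I] != I / Scale)
        return false;
    } else if (Mask[I] != -1) {
      return false;
    }
  }
  return true;
}
int main() {
  // v8i16 -> v4i32 zero extension expressed as an 8-element shuffle mask.
  assert(isZExtMask({0, -1, 1, -1, 2, -1, 3, -1}, 2));
  assert(!isZExtMask({0, 1, 2, 3, 4, 5, 6, 7}, 2));
  return 0;
}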
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
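The 4-lane shuffle immediate (as used by PSHUFD/SHUFPS-style instructions) packs four 2-bit source indices, lowest element first. A minimal sketch of that encoding; the helper name is hypothetical.
#include <cassert>
#include <cstdint>
// Pack four 2-bit lane indices into the x86 imm8 used by 4-lane shuffles:
// result element i is selected by bits [2*i+1 : 2*i].
static uint8_t shuffleImm8(int M0, int M1, int M2, int M3) {
  return static_cast<uint8_t>((M0 & 3) | ((M1 & 3) << 2) |
                              ((M2 & 3) << 4) | ((M3 & 3) << 6));
}
int main() {
  assert(shuffleImm8(0, 1, 2, 3) == 0xE4); // identity shuffle
  assert(shuffleImm8(3, 2, 1, 0) == 0x1B); // reversed lanes
  return 0;
}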
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
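In scalar terms, the pattern being detected (clamp to the destination's signed range with smax/smin, then truncate) is exactly a signed saturating truncation. A small reference model for i32 -> i8 (not LLVM code):
#include <cassert>
#include <cstdint>
// Reference model: truncate(smin(smax(x, INT8_MIN), INT8_MAX)).
static int8_t ssatTrunc(int32_t X) {
  if (X < INT8_MIN) X = INT8_MIN;
  if (X > INT8_MAX) X = INT8_MAX;
  return static_cast<int8_t>(X);
}
int main() {
  assert(ssatTrunc(1000) == 127);   // clamps to the signed max of the destination
  assert(ssatTrunc(-1000) == -128); // clamps to the signed min of the destination
  assert(ssatTrunc(42) == 42);      // in-range values are unchanged
  return 0;
}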
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
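For reference, the BT instruction that such a node ultimately selects to copies the chosen bit of the source into CF; a scalar model of the 32-bit register form (hypothetical helper, not the DAG builder itself):
#include <cassert>
#include <cstdint>
// Scalar model of a 32-bit register-form BT: CF = bit (BitNo mod 32) of Src.
static bool bitTest(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo & 31)) & 1u;
}
int main() {
  assert(bitTest(0x00000010u, 4));  // bit 4 is set
  assert(!bitTest(0x00000010u, 5)); // bit 5 is clear
  assert(bitTest(0x00000010u, 36)); // register form takes the bit index mod 32
  return 0;
}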
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit a truncating store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
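A scalar analogue of this transform, sketched in plain C++ (function names are illustrative): the no-signed-wrap assumption is what makes hoisting the extension legal.
#include <cstdint>

// sext(add_nsw(x, C)): extend after the (non-overflowing) 32-bit add.
int64_t sextAfterAdd(int32_t X, int32_t C) { return (int64_t)(X + C); }
// add(sext(x), C_sext): extend first, then add in the wider type.
int64_t addAfterSext(int32_t X, int32_t C) { return (int64_t)X + (int64_t)C; }
// Assuming X + C does not overflow (add_nsw), both return the same value.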
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
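For a concrete reference on the tie-breaking rule, std::round in plain C++ uses the same away-from-zero semantics (illustrative only; the helper name is not from this file):
#include <cmath>

void froundExamples() {
  double A = std::round(0.5);  // 1.0: the tie rounds away from zero
  double B = std::round(-0.5); // -1.0
  double C = std::round(2.5);  // 3.0, not 2.0 as round-to-even would give
  (void)A; (void)B; (void)C;
}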
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
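A bit-level sketch of that observation for a 32-bit float (illustrative helpers, not the lowering code itself, which applies the same idea with vector logic ops and a constant mask):
#include <cstdint>

// FABS clears the sign bit with an AND; FNEG flips it with an XOR.
uint32_t fabsBits(uint32_t Bits) { return Bits & ~0x80000000u; }
uint32_t fnegBits(uint32_t Bits) { return Bits ^ 0x80000000u; }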
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
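To illustrate the two unpack patterns, here are the element-index views for a v4i32 pair (an assumed example, not taken from the file), where V1 supplies indices 0-3 and V2 indices 4-7:
#include <array>

// unpackl interleaves the low halves: <a0,b0,a1,b1>
constexpr std::array<int, 4> UnpacklMask = {0, 4, 1, 5};
// unpackh interleaves the high halves: <a2,b2,a3,b3>
constexpr std::array<int, 4> UnpackhMask = {2, 6, 3, 7};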
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
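The scalar intuition behind this fold, as a sketch in plain C++ (function names are illustrative; the DAG combine operates on setcc nodes, not C expressions):
bool beforeFold(int X, int Y) { return (X == Y) ^ 1; } // xor(setcc eq, 1)
bool afterFold(int X, int Y) { return X != Y; }        // setcc with inverted condition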
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
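For context, these are the X86 shuffle-mask sentinels SM_SentinelUndef and SM_SentinelZero (assumed here to be -1 and -2, their usual values), so a sketch of the predicate is simply:
// Sketch only; the real code compares against the named sentinel constants.
static bool isUndefOrZeroSketch(int Val) { return Val == -1 || Val == -2; }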
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) This undoes the inverse fold performed in InstCom...
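The underlying bitwise identity can be checked in plain C++ (illustrative helpers; by De Morgan, ~(~Y & Z) == (Y | ~Z)):
#include <cstdint>

uint32_t beforeFold(uint32_t X, uint32_t Y, uint32_t Z) { return X & (Y | ~Z); }
uint32_t afterFold(uint32_t X, uint32_t Y, uint32_t Z) { return X & ~(~Y & Z); }
// Both return the same value for every X, Y, Z.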
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
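A short usage sketch of the APFloat operations listed above (illustrative only; assumes llvm/ADT/APFloat.h and a recent APFloat API):
#include "llvm/ADT/APFloat.h"

void apfloatSketch() {
  llvm::APFloat F = llvm::APFloat::getZero(llvm::APFloat::IEEEsingle(),
                                           /*Negative=*/true); // -0.0
  F.changeSign(); // flip the sign bit: +0.0
  F.clearSign();  // force the sign bit to 0: still +0.0
  bool LosesInfo = false;
  F.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmNearestTiesToEven,
            &LosesInfo); // widening float -> double is exact
}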
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
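A short sketch exercising a few of the APInt operations listed above (illustrative only; assumes llvm/ADT/APInt.h):
#include "llvm/ADT/APInt.h"

void apintSketch() {
  llvm::APInt A = llvm::APInt::getLowBitsSet(32, 8); // 0x000000FF
  A.setBit(31);                                      // 0x800000FF
  unsigned TZ = A.countr_zero();                     // 0: bit 0 is set
  bool Pow2 = A.isPowerOf2();                        // false: more than one bit set
  llvm::APInt B = A.lshr(8);                         // logical shift: 0x00800000
  llvm::APInt C = A.zext(64);                        // zero-extended to 64 bits
  (void)TZ; (void)Pow2; (void)B; (void)C;
}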
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
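A brief usage sketch of the ArrayRef operations listed above (illustrative only; assumes llvm/ADT/ArrayRef.h):
#include "llvm/ADT/ArrayRef.h"

void arrayRefSketch() {
  int Storage[] = {0, 1, 2, 3, 4, 5};
  llvm::ArrayRef<int> Mask(Storage);              // non-owning view of 6 elements
  llvm::ArrayRef<int> Lo = Mask.slice(0, 3);      // {0, 1, 2}
  llvm::ArrayRef<int> NoTail = Mask.drop_back(2); // {0, 1, 2, 3}
  bool Same = Lo.equals(NoTail.slice(0, 3));      // true: element-wise equality
  (void)Same;
}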
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory access performed by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:188
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:436
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
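A small sketch of the MVT queries listed above (illustrative only; the MachineValueType.h header has moved between LLVM releases, so the include path is an assumption for recent trees):
#include <cstdint>
#include "llvm/CodeGenTypes/MachineValueType.h" // path varies by LLVM version

void mvtSketch() {
  llvm::MVT V = llvm::MVT::getVectorVT(llvm::MVT::i32, 4); // v4i32
  uint64_t Bits = V.getFixedSizeInBits();                  // 128
  llvm::MVT Elt = V.getVectorElementType();                // i32
  llvm::MVT Half = V.getHalfNumVectorElementsVT();         // v2i32
  bool Is128 = V.is128BitVector();                         // true
  (void)Bits; (void)Elt; (void)Half; (void)Is128;
}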
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
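These builder methods are normally reached through BuildMI (declared in MachineInstrBuilder.h, not listed here) and chained fluently. A sketch with illustrative opcodes and registers; MBB, MBBI, DL, TII and the register values are assumed to be in scope:

  // Materialize an immediate, then add it to another register.
  BuildMI(*MBB, MBBI, DL, TII->get(X86::MOV32ri), TmpReg).addImm(42);
  BuildMI(*MBB, MBBI, DL, TII->get(X86::ADD32rr), DstReg)
      .addReg(TmpReg)
      .addReg(SrcReg, RegState::Kill);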
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
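A one-line sketch; the GR32 register class is an illustrative choice and MF is assumed to be the current MachineFunction:

  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register Tmp = MRI.createVirtualRegister(&X86::GR32RegClass);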
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
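A minimal sketch of a hypothetical helper that builds an "X == 0" comparison using the target's preferred setcc result type:

  static SDValue emitIsZero(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT VT = X.getValueType();
    // Ask the target what type a SETCC of VT should produce (e.g. i8 on x86).
    EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    return DAG.getSetCC(DL, CCVT, X, DAG.getConstant(0, DL, VT), ISD::SETEQ);
  }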
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
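Per the descriptions of getNOT (above) and getNegative (here), both helpers are shorthands for plain ISD nodes; a sketch assuming DL, VT and V are in scope:

  SDValue NotV = DAG.getNode(ISD::XOR, DL, VT, V,
                             DAG.getAllOnesConstant(DL, VT)); // == getNOT(DL, V, VT)
  SDValue NegV = DAG.getNode(ISD::SUB, DL, VT,
                             DAG.getConstant(0, DL, VT), V);  // == getNegative(V, DL, VT)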
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
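A sketch of the load/store helpers, loading an i32 from Ptr and storing it back; Chain, DL and Ptr are assumed from context and the MachinePointerInfo is left empty for brevity:

  SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  // The load's output chain (value #1) orders the store after the load.
  SDValue St = DAG.getStore(Ld.getValue(1), DL, Ld, Ptr, MachinePointerInfo(),
                            Align(4));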
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
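A two-line sketch of the extend-or-truncate helpers (see also getSExtOrTrunc above); Idx and Val are assumed to be integer-typed SDValues:

  SDValue Idx64 = DAG.getZExtOrTrunc(Idx, DL, MVT::i64); // normalize an index to i64
  SDValue Val16 = DAG.getSExtOrTrunc(Val, DL, MVT::i16); // sign-preserving variant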
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
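A sketch of using the known-bits queries to guard a fold; the 4-bit mask is illustrative:

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  APInt LowBits = APInt::getLowBitsSet(BitWidth, 4);
  if (DAG.MaskedValueIsZero(Op, LowBits)) {
    // The low 4 bits are proven zero; computeKnownBits(Op) would give the full
    // KnownBits if more detail were needed.
  }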
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
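A sketch in the spirit of getRegisterByName (listed further below); the register names are illustrative, and Default() is part of StringSwitch even though only Case() is listed here:

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("ebp", X86::EBP)
                     .Default(0);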
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
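A sketch of how a target constructor typically drives these hooks (operations and types are illustrative; setTruncStoreAction is listed further below):

  setOperationAction(ISD::CTPOP,  MVT::i32, Expand);  // expand to a generic sequence
  setOperationAction(ISD::SELECT, MVT::f64, Custom);  // route through LowerOperation
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);    // no f64 -> f32 truncating store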
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
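A sketch of the usual constructor pattern: register the classes, then derive the remaining properties; the Subtarget checks are illustrative:

  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.hasSSE1())
    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
  // Once all register classes are added, compute the derived properties.
  computeRegisterProperties(Subtarget.getRegisterInfo());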
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
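A hedged sketch of widening a mask (the expected result is an inference from the splat/merge behaviour described above):
  // Widening the 4-bit mask 0b1010 to 8 bits duplicates each bit: 0b11001100.
  APInt Narrow(4, 0b1010);
  APInt Wide = llvm::APIntOps::ScaleBitMask(Narrow, /*NewBitWidth=*/8);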
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
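A minimal illustrative sketch of the helper above (the choice of ISD::SETLT and MVT::i32 is an assumption for the example):
  // For integer types, !(X < Y) is (X >= Y).
  ISD::CondCode CC = ISD::SETLT;
  ISD::CondCode InvCC = ISD::getSetCCInverse(CC, MVT::i32); // InvCC == ISD::SETGE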
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
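A hedged sketch of using this hook (the operand Op and the power-of-two predicate are assumptions, not part of the listing):
  // Accept Op only if every constant (or splatted) element is a non-zero power of two.
  bool AllPow2 = ISD::matchUnaryPredicate(Op, [](ConstantSDNode *C) {
    return C->getAPIntValue().isPowerOf2();
  });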
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
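A hedged sketch combining the SDPatternMatch helpers listed above (the SDNode *N and SelectionDAG &DAG are assumed context variables):
  using namespace llvm::SDPatternMatch;
  // Recognize (setcc C1, C2, seteq) where both operands are integer constants or splats.
  bool IsConstEqCompare =
      sd_match(N, &DAG, m_SetCC(m_ConstInt(), m_ConstInt(),
                                m_SpecificCondCode(ISD::SETEQ)));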
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8 bits of a 32-bit value into a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16 bits of a 32-bit value into a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
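Illustrative sketch (relies only on the X86 condition-code enumerators):
  // COND_E (equal) inverts to COND_NE, and vice versa.
  X86::CondCode Inv = X86::GetOppositeBranchCondition(X86::COND_E); // Inv == X86::COND_NE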
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
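A hedged sketch (Op is an assumed vector operand, not taken from the listing):
  APInt SplatVal;
  if (X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/true)) {
    // Every defined lane of Op holds SplatVal.
  }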
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
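Illustrative compile-time checks of the signed-range predicate (a sketch, not taken from the source):
  static_assert(llvm::isInt<8>(127), "127 fits in a signed 8-bit immediate");
  static_assert(!llvm::isInt<8>(128), "128 does not fit in a signed 8-bit immediate");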
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
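A sketch of the expected decoding (the concrete mask values are an inference from the description above; SmallVector is assumed from llvm/ADT/SmallVector.h):
  // For a 4-element blend with immediate 0b0101, set bits select from the second source,
  // so the expected shuffle mask is {4, 1, 6, 3}.
  SmallVector<int, 4> Mask;
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);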
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2056
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1757
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
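Illustrative use of the power-of-two helpers above when strength-reducing a multiply (C is an assumed constant):
  uint64_t C = 64;
  if (llvm::isPowerOf2_64(C)) {
    unsigned ShAmt = llvm::Log2_64(C); // 6, so "mul x, 64" can become "shl x, 6"
    (void)ShAmt;
  }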
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
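A hedged sketch (the expected mask contents are an inference from the description above):
  // Unpack-low of two v4i32 sources interleaves their low halves: {0, 4, 1, 5}.
  SmallVector<int, 4> Mask;
  createUnpackShuffleMask(MVT::v4i32, Mask, /*Lo=*/true, /*Unary=*/false);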
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
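A hedged sketch (the scaled mask values are an inference from the description above):
  // Scaling {1, 0} by 2 gives the equivalent mask over half-width elements: {2, 3, 0, 1}.
  SmallVector<int, 4> Scaled;
  llvm::narrowShuffleMaskElts(/*Scale=*/2, /*Mask=*/{1, 0}, Scaled);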
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1840
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1934
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1815
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1941
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- that is, a dereference of an address in a register, with no scale, index or displacement.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
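A minimal sketch of the typical use when an access is offset from an aligned base (the names and values are illustrative):

  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  // Alignment of (Base + Offset) when Base is known to be A-aligned,
  // e.g. commonAlignment(Align(16), 4) == Align(4).
  static llvm::Align alignAfterOffset(llvm::Align A, uint64_t Offset) {
    return llvm::commonAlignment(A, Offset);
  }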
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2088
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
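A minimal sketch of decoding a pshufd immediate with this helper; the include path and the 0x1B immediate are illustrative assumptions (0x1B reverses the four lanes):

  #include "MCTargetDesc/X86ShuffleDecode.h" // assumed X86-internal include path
  #include "llvm/ADT/SmallVector.h"

  // pshufd with imm 0x1B on a v4i32 selects lanes <3,2,1,0>.
  static void decodeReversedPSHUFD(llvm::SmallVectorImpl<int> &Mask) {
    llvm::DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
  }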
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
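A minimal sketch of the half-open range seq yields, e.g. when building an identity shuffle mask (names illustrative):

  #include "llvm/ADT/Sequence.h"
  #include "llvm/ADT/SmallVector.h"

  // Append the identity mask <0, 1, ..., NumElts-1>.
  static void appendIdentityMask(int NumElts, llvm::SmallVectorImpl<int> &Mask) {
    for (int I : llvm::seq<int>(0, NumElts))
      Mask.push_back(I);
  }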
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1584
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
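A minimal sketch of popcount on an immediate (the blend-immediate framing is illustrative):

  #include "llvm/ADT/bit.h"

  // Number of lanes selected by a blend immediate, e.g. popcount(0xB1u) == 4.
  static int countSelectedLanes(unsigned Imm) {
    return llvm::popcount(Imm);
  }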
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
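A minimal sketch of widening a 4-element mask to 8 elements with this helper; the include path and mask values are illustrative assumptions:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h" // assumed home of scaleShuffleMaskElts

  // <0,2,1,3> scaled from 4 to 8 elements becomes <0,1,4,5,2,3,6,7>.
  static bool widenMaskTo8(llvm::SmallVectorImpl<int> &Scaled) {
    const int Mask[] = {0, 2, 1, 3};
    return llvm::scaleShuffleMaskElts(/*NumDstElts=*/8, Mask, Scaled);
  }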
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
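A minimal sketch of how a semantics tag and a rounding mode combine when folding a conversion (the double input and names are illustrative):

  #include "llvm/ADT/APFloat.h"

  // Round a double-precision constant to IEEE single precision,
  // reporting whether any information was lost.
  static llvm::APFloat roundToSingle(double D, bool &LosesInfo) {
    llvm::APFloat F(D);
    (void)F.convert(llvm::APFloat::IEEEsingle(),
                    llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
    return F;
  }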
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
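A minimal sketch of deriving a half-width vector type with these EVT helpers (names are illustrative):

  #include "llvm/CodeGen/ValueTypes.h"
  #include <cassert>

  // v8i32 -> v4i32, v4f64 -> v2f64, etc.
  static llvm::EVT getLowerHalfVT(llvm::LLVMContext &Ctx, llvm::EVT VT) {
    assert(VT.isVector() && VT.getVectorNumElements() >= 2 &&
           "expected a splittable vector type");
    return VT.getHalfNumVectorElementsVT(Ctx);
  }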
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
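A minimal sketch of combining KnownBits facts the way a computeKnownBits hook does (the constant 16 and names are illustrative):

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"

  // Known bits of (V + 16): if the low four bits of V are known zero,
  // the result's low four bits are known zero as well.
  static llvm::KnownBits knownBitsOfPlus16(const llvm::KnownBits &V) {
    llvm::KnownBits C =
        llvm::KnownBits::makeConstant(llvm::APInt(V.getBitWidth(), 16));
    return llvm::KnownBits::add(V, C);
  }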
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
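A minimal sketch of describing the upper half of a 32-byte stack slot with these helpers (the 16-byte offset and names are illustrative):

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"

  // Pointer info for bytes [16, 32) of frame index FI.
  static llvm::MachinePointerInfo getUpperHalfInfo(llvm::MachineFunction &MF,
                                                   int FI) {
    return llvm::MachinePointerInfo::getFixedStack(MF, FI).getWithOffset(16);
  }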
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
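A minimal sketch of the builder-style call setup these members enable; the callee, return type, argument list, and calling convention are placeholders supplied by the caller:

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/IR/CallingConv.h"
  #include <utility>

  // Lower a call to an external helper with already-prepared operands.
  static std::pair<llvm::SDValue, llvm::SDValue>
  emitHelperCall(const llvm::TargetLowering &TLI, llvm::SelectionDAG &DAG,
                 const llvm::SDLoc &DL, llvm::SDValue Chain,
                 llvm::SDValue Callee, llvm::Type *RetTy,
                 llvm::TargetLowering::ArgListTy &&Args) {
    llvm::TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
        llvm::CallingConv::C, RetTy, Callee, std::move(Args));
    return TLI.LowerCallTo(CLI);
  }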
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.