1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
98static cl::opt<bool>
99 WidenShift("x86-widen-shift", cl::init(true),
100 cl::desc("Replace narrow shifts with wider shifts."),
101 cl::Hidden);
102
104 "x86-br-merging-likely-bias", cl::init(0),
105 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
111 "branches."),
112 cl::Hidden);
113
115 "x86-br-merging-unlikely-bias", cl::init(-1),
116 cl::desc(
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
123 "branches."),
124 cl::Hidden);
125
127 "mul-constant-optimization", cl::init(true),
128 cl::desc("Replace 'mul x, Const' with more effective instructions like "
129 "SHIFT, LEA, etc."),
130 cl::Hidden);
131
132X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
133 const X86Subtarget &STI)
134 : TargetLowering(TM), Subtarget(STI) {
135 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
136 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
137
138 // Set up the TargetLowering object.
139
140 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 setBooleanContents(ZeroOrOneBooleanContent);
142 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
144
145 // X86 instruction cache is coherent with its data cache so we can use the
146 // default expansion to a no-op.
148
149 // For 64-bit, since we have so many registers, use the ILP scheduler.
150 // For 32-bit, use the register pressure specific scheduling.
151 // For Atom, always use ILP scheduling.
152 if (Subtarget.isAtom())
153 setSchedulingPreference(Sched::ILP);
154 else if (Subtarget.is64Bit())
155 setSchedulingPreference(Sched::ILP);
156 else
157 setSchedulingPreference(Sched::RegPressure);
158 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
159 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
160
161 // Bypass expensive divides and use cheaper ones.
162 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
163 if (Subtarget.hasSlowDivide32())
164 addBypassSlowDiv(32, 8);
165 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
166 addBypassSlowDiv(64, 32);
167 }
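  // (Illustration, roughly what the bypass above produces at run time:
  //   if (((a | b) >> 32) == 0)  use a 32-bit DIV on the low halves;
  //   else                       use the full-width 64-bit DIV.)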
168
169 if (Subtarget.canUseCMPXCHG16B())
170 setMaxAtomicSizeInBitsSupported(128);
171 else if (Subtarget.canUseCMPXCHG8B())
172 setMaxAtomicSizeInBitsSupported(64);
173 else
174 setMaxAtomicSizeInBitsSupported(32);
175
176 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
177
179
180 // Set up the register classes.
181 addRegisterClass(MVT::i8, &X86::GR8RegClass);
182 addRegisterClass(MVT::i16, &X86::GR16RegClass);
183 addRegisterClass(MVT::i32, &X86::GR32RegClass);
184 if (Subtarget.is64Bit())
185 addRegisterClass(MVT::i64, &X86::GR64RegClass);
186
187 for (MVT VT : MVT::integer_valuetypes())
189
190 // We don't accept any truncstore of integer registers.
191 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
193 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
194 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
197
198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
199
200 // SETOEQ and SETUNE require checking two conditions.
201 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
202 setCondCodeAction(ISD::SETOEQ, VT, Expand);
203 setCondCodeAction(ISD::SETUNE, VT, Expand);
204 }
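  // (Note: UCOMISS/UCOMISD report "unordered" through PF, so an ordered-equal
  // test needs ZF == 1 and PF == 0 (and SETUNE the complement), i.e. two flag
  // checks rather than one, hence the Expand above.)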
205
206 // Integer absolute.
207 if (Subtarget.canUseCMOV()) {
208 setOperationAction(ISD::ABS , MVT::i16 , Custom);
209 setOperationAction(ISD::ABS , MVT::i32 , Custom);
210 if (Subtarget.is64Bit())
211 setOperationAction(ISD::ABS , MVT::i64 , Custom);
212 }
213
214 // Absolute difference.
215 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
216 setOperationAction(Op , MVT::i8 , Custom);
217 setOperationAction(Op , MVT::i16 , Custom);
218 setOperationAction(Op , MVT::i32 , Custom);
219 if (Subtarget.is64Bit())
220 setOperationAction(Op , MVT::i64 , Custom);
221 }
222
223 // Signed saturation subtraction.
227 if (Subtarget.is64Bit())
229
230 // Funnel shifts.
231 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
232 // For slow shld targets we only lower for code size.
233 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
234
235 setOperationAction(ShiftOp , MVT::i8 , Custom);
236 setOperationAction(ShiftOp , MVT::i16 , Custom);
237 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
238 if (Subtarget.is64Bit())
239 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
240 }
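  // (Note: on fast-SHLD targets, e.g. fshl on i32 maps onto the double shift
  //    shld eax, edx, cl   ; eax = (eax << cl) | (edx >> (32 - cl))
  // which is why i32/i64 are Legal there; on slow-SHLD targets the Custom
  // lowering only forms SHLD when optimizing for size.)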
241
242 if (!Subtarget.useSoftFloat()) {
243 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
244 // operation.
249 // We have an algorithm for SSE2, and we turn this into a 64-bit
250 // FILD or VCVTUSI2SS/SD for other targets.
253 // We have an algorithm for SSE2->double, and we turn this into a
254 // 64-bit FILD followed by conditional FADD for other targets.
257
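  // (Note, roughly: the FILD-based path converts the u64 as if it were
  // signed; if the sign bit was set, the result is fixed up afterwards by
  // conditionally adding 2^64 as a floating-point constant.)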
258 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
259 // this operation.
262 // SSE has no i16 to fp conversion, only i32. We promote in the handler
263 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
273
274 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
275 // this operation.
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
283 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
284 // are Legal, f80 is custom lowered.
287
288 // Handle FP_TO_UINT by promoting the destination to a larger signed
289 // conversion.
291 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300
301 setOperationAction(ISD::LRINT, MVT::f32, Custom);
302 setOperationAction(ISD::LRINT, MVT::f64, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
305
306 if (!Subtarget.is64Bit()) {
307 setOperationAction(ISD::LRINT, MVT::i64, Custom);
308 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
309 }
310 }
311
312 if (Subtarget.hasSSE2()) {
313 // Custom lowering for saturating float to int conversions.
314 // We handle promotion to larger result types manually.
315 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 }
321 if (Subtarget.is64Bit()) {
324 }
325 }
326 if (Subtarget.hasAVX10_2()) {
331 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
332 MVT::v4i64}) {
335 }
336 if (Subtarget.is64Bit()) {
339 }
340 }
341
342 // Handle address space casts between mixed sized pointers.
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
344 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
345
346 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
347 if (!Subtarget.hasSSE2()) {
348 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
349 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
352 if (Subtarget.is64Bit()) {
353 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
354 // Without SSE, i64->f64 goes through memory.
355 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
356 }
357 } else if (!Subtarget.is64Bit())
358 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
359
360 // Scalar integer divide and remainder are lowered to use operations that
361 // produce two results, to match the available instructions. This exposes
362 // the two-result form to trivial CSE, which is able to combine x/y and x%y
363 // into a single instruction.
364 //
365 // Scalar integer multiply-high is also lowered to use two-result
366 // operations, to match the available instructions. However, plain multiply
367 // (low) operations are left as Legal, as there are single-result
368 // instructions for this in x86. Using the two-result multiply instructions
369 // when both high and low results are needed must be arranged by dagcombine.
370 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
377 }
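  // (Note: e.g. "q = a / b; r = a % b;" becomes one ISD::SDIVREM/UDIVREM
  // node, which selects to a single IDIV/DIV leaving the quotient in EAX/RAX
  // and the remainder in EDX/RDX.)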
378
379 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
380 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
381 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
382 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
383 setOperationAction(ISD::BR_CC, VT, Expand);
385 }
386 if (Subtarget.is64Bit())
391
392 setOperationAction(ISD::FREM , MVT::f32 , Expand);
393 setOperationAction(ISD::FREM , MVT::f64 , Expand);
394 setOperationAction(ISD::FREM , MVT::f80 , Expand);
395 setOperationAction(ISD::FREM , MVT::f128 , Expand);
396
397 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
400 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
402 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
403 }
404
405 // Promote the i8 variants and force them on up to i32 which has a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
409 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
410 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
411 // promote that too.
412 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
414
415 if (!Subtarget.hasBMI()) {
416 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
418 if (Subtarget.is64Bit()) {
419 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
421 }
422 }
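  // (Note: without TZCNT, the Custom CTTZ lowering falls back to BSF, whose
  // destination is undefined for a zero input, so the zero case is handled
  // separately, typically with a CMOV.)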
423
424 if (Subtarget.hasLZCNT()) {
425 // When promoting the i8 variants, force them to i32 for a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
429 } else {
430 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
431 if (VT == MVT::i64 && !Subtarget.is64Bit())
432 continue;
435 }
436 }
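  // (Note: without LZCNT, CTLZ is built from BSR, which returns the index of
  // the highest set bit (so the result is flipped, e.g. 31 - BSR for i32)
  // and is likewise undefined for a zero input.)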
437
438 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
439 ISD::STRICT_FP_TO_FP16}) {
440 // Special handling for half-precision floating point conversions.
441 // If we don't have F16C support, then lower half float conversions
442 // into library calls.
444 Op, MVT::f32,
445 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
446 // There's never any support for operations beyond MVT::f32.
447 setOperationAction(Op, MVT::f64, Expand);
448 setOperationAction(Op, MVT::f80, Expand);
449 setOperationAction(Op, MVT::f128, Expand);
450 }
451
452 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
453 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
454 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
455 }
456
457 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
459 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
460 setTruncStoreAction(VT, MVT::f16, Expand);
461 setTruncStoreAction(VT, MVT::bf16, Expand);
462
463 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
464 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
465 }
466
470 if (Subtarget.is64Bit())
472 if (Subtarget.hasPOPCNT()) {
473 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
474 // popcntw is longer to encode than popcntl and also has a false dependency
475 // on the dest that popcntl hasn't had since Cannon Lake.
476 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
477 } else {
482 }
483
484 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
485
486 if (!Subtarget.hasMOVBE())
488
489 // X86 wants to expand cmov itself.
490 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
495 }
496 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
497 if (VT == MVT::i64 && !Subtarget.is64Bit())
498 continue;
501 }
502
503 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
506
508 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
509 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
513
514 // Darwin ABI issue.
515 for (auto VT : { MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
524 }
525
526 // 64-bit shl, sra, srl (iff 32-bit x86)
527 for (auto VT : { MVT::i32, MVT::i64 }) {
528 if (VT == MVT::i64 && !Subtarget.is64Bit())
529 continue;
533 }
534
535 if (Subtarget.hasSSEPrefetch())
536 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
537
538 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
539
540 // Expand certain atomics
541 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
547 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
548 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
549 }
550
551 if (!Subtarget.is64Bit())
552 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
553
554 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
555 // All CPUs supporting AVX will atomically load/store aligned 128-bit
556 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
558 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
559 }
560
561 if (Subtarget.canUseCMPXCHG16B())
562 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
563
564 // FIXME - use subtarget debug flags
565 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
566 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
567 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
568 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
569 }
570
573
574 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
575 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
576
577 setOperationAction(ISD::TRAP, MVT::Other, Legal);
578 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
579 if (Subtarget.isTargetPS())
580 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
581 else
582 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
583
584 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
585 setOperationAction(ISD::VASTART , MVT::Other, Custom);
586 setOperationAction(ISD::VAEND , MVT::Other, Expand);
587 bool Is64Bit = Subtarget.is64Bit();
588 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
589 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
590
591 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
592 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
593
594 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
595
596 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
598 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
599
601
602 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
603 setOperationAction(ISD::FABS, VT, Action);
604 setOperationAction(ISD::FNEG, VT, Action);
606 setOperationAction(ISD::FREM, VT, Action);
607 setOperationAction(ISD::FMA, VT, Action);
608 setOperationAction(ISD::FMINNUM, VT, Action);
609 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FMINIMUM, VT, Action);
611 setOperationAction(ISD::FMAXIMUM, VT, Action);
612 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
613 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
614 setOperationAction(ISD::FSIN, VT, Action);
615 setOperationAction(ISD::FCOS, VT, Action);
616 setOperationAction(ISD::FSINCOS, VT, Action);
617 setOperationAction(ISD::FTAN, VT, Action);
618 setOperationAction(ISD::FSQRT, VT, Action);
619 setOperationAction(ISD::FPOW, VT, Action);
620 setOperationAction(ISD::FPOWI, VT, Action);
621 setOperationAction(ISD::FLOG, VT, Action);
622 setOperationAction(ISD::FLOG2, VT, Action);
623 setOperationAction(ISD::FLOG10, VT, Action);
624 setOperationAction(ISD::FEXP, VT, Action);
625 setOperationAction(ISD::FEXP2, VT, Action);
626 setOperationAction(ISD::FEXP10, VT, Action);
627 setOperationAction(ISD::FCEIL, VT, Action);
628 setOperationAction(ISD::FFLOOR, VT, Action);
629 setOperationAction(ISD::FNEARBYINT, VT, Action);
630 setOperationAction(ISD::FRINT, VT, Action);
631 setOperationAction(ISD::BR_CC, VT, Action);
632 setOperationAction(ISD::SETCC, VT, Action);
635 setOperationAction(ISD::FROUND, VT, Action);
636 setOperationAction(ISD::FROUNDEVEN, VT, Action);
637 setOperationAction(ISD::FTRUNC, VT, Action);
638 setOperationAction(ISD::FLDEXP, VT, Action);
639 };
640
641 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
642 // f16, f32 and f64 use SSE.
643 // Set up the FP register classes.
644 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
645 : &X86::FR16RegClass);
646 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
647 : &X86::FR32RegClass);
648 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
649 : &X86::FR64RegClass);
650
651 // Disable f32->f64 extload as we can only generate this in one instruction
652 // under optsize. So it's easier to pattern match (fpext (load)) for that
653 // case instead of needing to emit 2 instructions for extload in the
654 // non-optsize case.
655 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
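  // (Note, roughly: under optsize, isel matches (fpext (load addr)) to a
  // single CVTSS2SD with a memory operand; otherwise a MOVSS load followed
  // by a register-to-register CVTSS2SD is preferred.)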
656
657 for (auto VT : { MVT::f32, MVT::f64 }) {
658 // Use ANDPD to simulate FABS.
659 setOperationAction(ISD::FABS, VT, Custom);
660
661 // Use XORP to simulate FNEG.
662 setOperationAction(ISD::FNEG, VT, Custom);
663
664 // Use ANDPD and ORPD to simulate FCOPYSIGN.
666
667 // These might be better off as horizontal vector ops.
670
671 // We don't support sin/cos/fmod
672 setOperationAction(ISD::FSIN , VT, Expand);
673 setOperationAction(ISD::FCOS , VT, Expand);
674 setOperationAction(ISD::FSINCOS, VT, Expand);
675 }
676
677 // Half type will be promoted by default.
678 setF16Action(MVT::f16, Promote);
683 setOperationAction(ISD::FABS, MVT::f16, Custom);
684 setOperationAction(ISD::FNEG, MVT::f16, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
689
720 setOperationAction(ISD::LRINT, MVT::f16, Expand);
721 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
722
723 // Lower this to MOVMSK plus an AND.
726
727 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
728 (UseX87 || Is64Bit)) {
729 // Use SSE for f32, x87 for f64.
730 // Set up the FP register classes.
731 addRegisterClass(MVT::f32, &X86::FR32RegClass);
732 if (UseX87)
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734
735 // Use ANDPS to simulate FABS.
736 setOperationAction(ISD::FABS , MVT::f32, Custom);
737
738 // Use XORP to simulate FNEG.
739 setOperationAction(ISD::FNEG , MVT::f32, Custom);
740
741 if (UseX87)
743
744 // Use ANDPS and ORPS to simulate FCOPYSIGN.
745 if (UseX87)
748
749 // We don't support sin/cos/fmod
750 setOperationAction(ISD::FSIN , MVT::f32, Expand);
751 setOperationAction(ISD::FCOS , MVT::f32, Expand);
752 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
753
754 if (UseX87) {
755 // Always expand sin/cos functions even though x87 has an instruction.
756 setOperationAction(ISD::FSIN, MVT::f64, Expand);
757 setOperationAction(ISD::FCOS, MVT::f64, Expand);
758 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
759 }
760 } else if (UseX87) {
761 // f32 and f64 in x87.
762 // Set up the FP register classes.
763 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
764 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
765
766 for (auto VT : { MVT::f32, MVT::f64 }) {
769
770 // Always expand sin/cos functions even though x87 has an instruction.
771 setOperationAction(ISD::FSIN , VT, Expand);
772 setOperationAction(ISD::FCOS , VT, Expand);
773 setOperationAction(ISD::FSINCOS, VT, Expand);
774 }
775 }
776
777 // Expand FP32 immediates into loads from the stack, save special cases.
778 if (isTypeLegal(MVT::f32)) {
779 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
780 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
781 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
782 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
783 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
784 } else // SSE immediates.
785 addLegalFPImmediate(APFloat(+0.0f)); // xorps
786 }
787 // Expand FP64 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f64)) {
789 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
790 addLegalFPImmediate(APFloat(+0.0)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0)); // xorpd
796 }
797 // Support fp16 0 immediate.
798 if (isTypeLegal(MVT::f16))
799 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
800
801 // Handle constrained floating-point operations of scalar.
814
815 // We don't support FMA.
818
819 // f80 always uses X87.
820 if (UseX87) {
821 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
824 {
826 addLegalFPImmediate(TmpFlt); // FLD0
827 TmpFlt.changeSign();
828 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
829
830 bool ignored;
831 APFloat TmpFlt2(+1.0);
833 &ignored);
834 addLegalFPImmediate(TmpFlt2); // FLD1
835 TmpFlt2.changeSign();
836 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
837 }
838
839 // Always expand sin/cos functions even though x87 has an instruction.
840 // clang-format off
841 setOperationAction(ISD::FSIN , MVT::f80, Expand);
842 setOperationAction(ISD::FCOS , MVT::f80, Expand);
843 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
844 setOperationAction(ISD::FTAN , MVT::f80, Expand);
845 setOperationAction(ISD::FASIN , MVT::f80, Expand);
846 setOperationAction(ISD::FACOS , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN , MVT::f80, Expand);
848 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
849 setOperationAction(ISD::FSINH , MVT::f80, Expand);
850 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
851 setOperationAction(ISD::FTANH , MVT::f80, Expand);
852 // clang-format on
853
854 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
855 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
856 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
857 setOperationAction(ISD::FRINT, MVT::f80, Expand);
858 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
859 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
861 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
863 setOperationAction(ISD::LRINT, MVT::f80, Custom);
864 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
865
866 // Handle constrained floating-point operations of scalar.
873 if (isTypeLegal(MVT::f16)) {
874 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
876 } else {
878 }
879 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
880 // as Custom.
882 }
883
884 // f128 uses xmm registers, but most operations require libcalls.
885 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
890
901
902 setOperationAction(ISD::FABS, MVT::f128, Custom);
903 setOperationAction(ISD::FNEG, MVT::f128, Custom);
905
906 // clang-format off
907 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
909 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
912 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
914 // clang-format on
915 // No STRICT_FSINCOS
916 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
918
919 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
921 // We need to custom handle any FP_ROUND with an f128 input, but
922 // LegalizeDAG uses the result type to know when to run a custom handler.
923 // So we have to list all legal floating point result types here.
924 if (isTypeLegal(MVT::f32)) {
927 }
928 if (isTypeLegal(MVT::f64)) {
931 }
932 if (isTypeLegal(MVT::f80)) {
936 }
937
939
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
942 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
945 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
946 }
947
948 // Always use a library call for pow.
949 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
952 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
953
954 setOperationAction(ISD::FLOG, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
956 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
959 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
960 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
961 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
962
963 // Some FP actions are always expanded for vector types.
964 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
965 MVT::v4f32, MVT::v8f32, MVT::v16f32,
966 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
967 // clang-format off
968 setOperationAction(ISD::FSIN, VT, Expand);
969 setOperationAction(ISD::FSINCOS, VT, Expand);
970 setOperationAction(ISD::FCOS, VT, Expand);
971 setOperationAction(ISD::FTAN, VT, Expand);
974 setOperationAction(ISD::FPOW, VT, Expand);
975 setOperationAction(ISD::FLOG, VT, Expand);
976 setOperationAction(ISD::FLOG2, VT, Expand);
977 setOperationAction(ISD::FLOG10, VT, Expand);
978 setOperationAction(ISD::FEXP, VT, Expand);
979 setOperationAction(ISD::FEXP2, VT, Expand);
980 setOperationAction(ISD::FEXP10, VT, Expand);
981 // clang-format on
982 }
983
984 // First set operation action for all vector types to either promote
985 // (for widening) or expand (for scalarization). Then we will selectively
986 // turn on ones that can be effectively codegen'd.
997 setOperationAction(ISD::FFLOOR, VT, Expand);
998 setOperationAction(ISD::FCEIL, VT, Expand);
999 setOperationAction(ISD::FTRUNC, VT, Expand);
1000 setOperationAction(ISD::FRINT, VT, Expand);
1001 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1002 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1026 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1027 setTruncStoreAction(InnerVT, VT, Expand);
1028
1029 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1030 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1031
1032 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1033 // types, we have to deal with them whether we ask for Expansion or not.
1034 // Setting Expand causes its own optimisation problems though, so leave
1035 // them legal.
1036 if (VT.getVectorElementType() == MVT::i1)
1037 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1038
1039 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1040 // split/scalarized right now.
1041 if (VT.getVectorElementType() == MVT::f16 ||
1042 VT.getVectorElementType() == MVT::bf16)
1043 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1044 }
1045 }
1046
1047 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1048 // with -msoft-float, disable use of MMX as well.
1049 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1050 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1051 // No operations on x86mmx supported, everything uses intrinsics.
1052 }
1053
1054 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1055 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1056 : &X86::VR128RegClass);
1057
1058 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1061 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1064 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1072
1073 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1074 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1076
1082 }
1083
1084 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1085 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1086 : &X86::VR128RegClass);
1087
1088 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1089 // registers cannot be used even for integer operations.
1090 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1091 : &X86::VR128RegClass);
1092 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1095 : &X86::VR128RegClass);
1096 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1097 : &X86::VR128RegClass);
1098 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1099 : &X86::VR128RegClass);
1100
1101 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1102 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1103 setOperationAction(ISD::FMINIMUM, VT, Custom);
1104 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1105 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1106 }
1107
1108 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1109 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1114 }
1115
1116 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1118 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1119
1120 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1121 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1122 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1123 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1125 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1127 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1129 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1132
1133 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1135 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1136
1137 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1139 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1141
1142 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1143 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1146 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1148 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1150 }
1151
1162
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1174
1175 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1176 // setcc all the way to isel and prefer SETGT in some isel patterns.
1179 }
1180
1181 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1182 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1187
1188 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1194 }
1195
1196 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1200
1201 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1202 continue;
1203
1206 }
1207 setF16Action(MVT::v8f16, Expand);
1208 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1212 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1213 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1215
1216 // Custom lower v2i64 and v2f64 selects.
1223
1230
1231 // Custom legalize these to avoid over promotion or custom promotion.
1232 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1237 }
1238
1243
1246
1249
1250 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1255
1256 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1260
1261 // We want to legalize this to an f64 load rather than an i64 load on
1262 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1263 // store.
1264 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1266 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1267 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1268 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1269 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
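  // (Note: e.g. a v2i32 load then becomes one 8-byte scalar load selected as
  // MOVSD/MOVQ straight into an XMM register, rather than going through
  // integer registers.)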
1270
1271 // Add 32-bit vector stores to help vectorization opportunities.
1272 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1273 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1274
1275 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1277 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1278 if (!Subtarget.hasAVX512())
1279 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1280
1284
1286
1303
1304 // In the customized shift lowering, the legal v4i32/v2i64 cases
1305 // in AVX2 will be recognized.
1306 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1310 if (VT == MVT::v2i64) continue;
1315 }
1316
1322 }
1323
1324 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1329
1330 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1332 }
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1354 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1358 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1362 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1364 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1366
1367 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
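  // (Note: e.g. sign_extend v2i32 -> v2i64 can use PMOVSXDQ here, even
  // though there is no v2i64 arithmetic shift right before AVX-512.)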
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1455 setOperationAction(ISD::FFLOOR, VT, Legal);
1457 setOperationAction(ISD::FCEIL, VT, Legal);
1459 setOperationAction(ISD::FTRUNC, VT, Legal);
1461 setOperationAction(ISD::FRINT, VT, Legal);
1463 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1465 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1467
1468 setOperationAction(ISD::FROUND, VT, Custom);
1469
1470 setOperationAction(ISD::FNEG, VT, Custom);
1471 setOperationAction(ISD::FABS, VT, Custom);
1473
1474 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1475 setOperationAction(ISD::FMINIMUM, VT, Custom);
1476 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1477 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1479 }
1480
1481 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1482 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1483
1484 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1485 // even though v8i16 is a legal type.
1486 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1489 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1493
1496 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1498 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1500
1512
1513 if (!Subtarget.hasAVX512())
1514 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1515
1516 // In the customized shift lowering, the legal v8i32/v4i64 cases
1517 // in AVX2 will be recognized.
1518 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1524 if (VT == MVT::v4i64) continue;
1529 }
1530
1531 // These types need custom splitting if their input is a 128-bit vector.
1536
1540 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1541 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1544
1545 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1549 }
1550
1555
1556 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1561
1562 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1563 // setcc all the way to isel and prefer SETGT in some isel patterns.
1566 }
1567
1568 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1569 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1574
1575 if (Subtarget.hasAnyFMA()) {
1576 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1577 MVT::v2f64, MVT::v4f64 }) {
1580 }
1581 }
1582
1583 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1584 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1585 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1586 }
1587
1588 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1589 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1592
1593 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1595 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1597 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1598 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1601
1602 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1603 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1604
1605 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1608 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1609 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1610
1611 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1618 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1623
1624 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1625 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1630 }
1631
1632 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1635 }
1636
1637 if (HasInt256) {
1638 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1639 // when we have a 256bit-wide blend with immediate.
1642
1643 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1644 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1645 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1650 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1651 }
1652 }
1653
1654 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1655 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1656 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 }
1659
1660 // Extract subvector is special because the value type
1661 // (result) is 128-bit but the source is 256-bit wide.
1662 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1663 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1665 }
1666
1667 // Custom lower several nodes for 256-bit types.
1668 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1669 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1678 setOperationAction(ISD::STORE, VT, Custom);
1679 }
1680 setF16Action(MVT::v16f16, Expand);
1681 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1684 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1687 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1688
1689 if (HasInt256) {
1691
1692 // Custom legalize 2x32 to get a little better code.
1693 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1694 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1695
1696 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1697 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1698 setOperationAction(ISD::MGATHER, VT, Custom);
1699 }
1700 }
1701
1702 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1703 Subtarget.hasF16C()) {
1704 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1707 }
1708 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1711 }
1712 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1713 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1714 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1715 }
1716 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1717 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1718 }
1719
1720 // This block controls legalization of the mask vector sizes that are
1721 // available with AVX512. 512-bit vectors are in a separate block controlled
1722 // by useAVX512Regs.
1723 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1724 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1725 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1726 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1727 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1728 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1729
1733
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1736 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1741 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1749
1750 // There is no byte sized k-register load or store without AVX512DQ.
1751 if (!Subtarget.hasDQI()) {
1752 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1755 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1756
1757 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1760 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1761 }
1762
1763 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1764 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1768 }
1769
1770 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1772
1773 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1777
1784 }
1785
1786 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1788 }
1789 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1790 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 setOperationAction(ISD::LRINT, VT, Legal);
1792 setOperationAction(ISD::LLRINT, VT, Legal);
1793 }
1794 }
1795
1796 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1797 // elements. 512-bits can be disabled based on prefer-vector-width and
1798 // required-vector-width function attributes.
1799 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1800 bool HasBWI = Subtarget.hasBWI();
1801
1802 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1808 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1809
1810 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1815 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1816 if (HasBWI)
1817 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1818 }
1819
1820 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1821 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1822 setOperationAction(ISD::FMINIMUM, VT, Custom);
1823 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1825 setOperationAction(ISD::FNEG, VT, Custom);
1826 setOperationAction(ISD::FABS, VT, Custom);
1831 }
1832 setOperationAction(ISD::LRINT, MVT::v16f32,
1833 Subtarget.hasDQI() ? Legal : Custom);
1834 setOperationAction(ISD::LRINT, MVT::v8f64,
1835 Subtarget.hasDQI() ? Legal : Custom);
1836 if (Subtarget.hasDQI())
1837 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1838
1839 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1844 }
1845
1846 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1851 }
1852
1857 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1859
1871
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1874 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1876 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1877 if (HasBWI)
1878 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1879
1880 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1881 // to 512-bit rather than use the AVX2 instructions so that we can use
1882 // k-masks.
1883 if (!Subtarget.hasVLX()) {
1884 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1885 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1886 setOperationAction(ISD::MLOAD, VT, Custom);
1887 setOperationAction(ISD::MSTORE, VT, Custom);
1888 }
1889 }
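  // (Note, roughly: e.g. a masked load of v8f32 is widened to v16f32 with
  // the extra mask bits cleared, so the EVEX k-register form can be used
  // instead of the AVX/AVX2 VMASKMOV-style instructions.)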
1890
1892 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1893 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1903
1904 if (HasBWI) {
1905 // Extends from v64i1 masks to 512-bit vectors.
1909 }
1910
1911 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1912 setOperationAction(ISD::FFLOOR, VT, Legal);
1914 setOperationAction(ISD::FCEIL, VT, Legal);
1916 setOperationAction(ISD::FTRUNC, VT, Legal);
1918 setOperationAction(ISD::FRINT, VT, Legal);
1920 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1922 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1924
1925 setOperationAction(ISD::FROUND, VT, Custom);
1926 }
1927
1928 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1931 }
1932
1933 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1936 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1937
1938 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1939 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1940 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1942
1943 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1945 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1947 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1948 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1953 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1954
1955 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1965
1966 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1967 // setcc all the way to isel and prefer SETGT in some isel patterns.
1970 }
1971
1972 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1973 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1978
1979 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1986 }
1987
1988 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1989 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1992 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1995 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2000 }
2001
2002 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2004 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2006 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2007 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2008
2009 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2013 setOperationAction(Opc, MVT::v8i64, Custom);
2014
2015 if (Subtarget.hasDQI())
2016 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2017
2018 if (Subtarget.hasCDI()) {
2019 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2020 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2022 }
2023 } // Subtarget.hasCDI()
2024
2025 if (Subtarget.hasVPOPCNTDQ()) {
2026 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2028 }
2029
2030 // Extract subvector is special because the value type
2031 // (result) is 256-bit but the source is 512-bit wide.
2032 // 128-bit was made Legal under AVX1.
2033 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2034 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2036
2037 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2038 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2048 }
2049 setF16Action(MVT::v32f16, Expand);
2052 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2054 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2055 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2056 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2057
2058 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2059 setOperationAction(ISD::MLOAD, VT, Legal);
2060 setOperationAction(ISD::MSTORE, VT, Legal);
2061 setOperationAction(ISD::MGATHER, VT, Custom);
2062 setOperationAction(ISD::MSCATTER, VT, Custom);
2063 }
2064 if (HasBWI) {
2065 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2066 setOperationAction(ISD::MLOAD, VT, Legal);
2067 setOperationAction(ISD::MSTORE, VT, Legal);
2068 }
2069 } else {
2070 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2071 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2072 }
2073
2074 if (Subtarget.hasVBMI2()) {
2075 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2078 }
2079
2080 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2081 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2082 }
2083
2084 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2085 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2087 }// useAVX512Regs
2088
2089 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2090 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2091 MVT::v4i64}) {
2094 }
2095 }
2096
2097 // This block controls legalization for operations that don't have
2098 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2099 // narrower widths.
2100 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2101 // These operations are handled on non-VLX by artificially widening in
2102 // isel patterns.
2103
2107
2108 if (Subtarget.hasDQI()) {
2109 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2110 // v2f32 UINT_TO_FP is already custom under SSE2.
2113 "Unexpected operation action!");
2114 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2119 }
2120
2121 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2132 }
2133
2134 // Custom legalize 2x32 to get a little better code.
2135 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2136 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2137
2138 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2139 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2140 setOperationAction(ISD::MSCATTER, VT, Custom);
2141
2142 if (Subtarget.hasDQI()) {
2146 setOperationAction(Opc, MVT::v2i64, Custom);
2147 setOperationAction(Opc, MVT::v4i64, Custom);
2148 }
2149 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2150 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2151 }
2152
2153 if (Subtarget.hasCDI()) {
2154 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2156 }
2157 } // Subtarget.hasCDI()
2158
2159 if (Subtarget.hasVPOPCNTDQ()) {
2160 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2162 }
2163
2164 // We can try to convert vectors to different sizes to leverage legal
2165 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2166 // then specialize to Legal below.
2167 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2168 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2169 MVT::v16i16, MVT::v8i8})
2171
2172 // Legal vpcompress depends on various AVX512 extensions.
2173 // Legal in AVX512F
2174 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2176
2177 // Legal in AVX512F + AVX512VL
2178 if (Subtarget.hasVLX())
2179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2180 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2182
2183 // Legal in AVX512F + AVX512VBMI2
2184 if (Subtarget.hasVBMI2())
2185 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2187
2188 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2189 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2190 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2192 }
2193
2194 // This block controls legalization of v32i1/v64i1, which are available with
2195 // AVX512BW.
2196 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2197 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2198 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2199
2200 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2211 }
2212
2213 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2215
2216 // Extends from v32i1 masks to 256-bit vectors.
2220
2221 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2222 MVT::v16f16, MVT::v8f16}) {
2223 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2225 }
2226
2227 // These operations are handled on non-VLX by artificially widening in
2228 // isel patterns.
2229 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2230
2231 if (Subtarget.hasBITALG()) {
2232 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2234 }
2235 }
2236
2237 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2238 auto setGroup = [&] (MVT VT) {
2247 setOperationAction(ISD::FSQRT, VT, Legal);
2249
2250 setOperationAction(ISD::FFLOOR, VT, Legal);
2252 setOperationAction(ISD::FCEIL, VT, Legal);
2254 setOperationAction(ISD::FTRUNC, VT, Legal);
2256 setOperationAction(ISD::FRINT, VT, Legal);
2258 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2260 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2262
2263 setOperationAction(ISD::FROUND, VT, Custom);
2264
2265 setOperationAction(ISD::LOAD, VT, Legal);
2266 setOperationAction(ISD::STORE, VT, Legal);
2267
2273
2274 setOperationAction(ISD::FNEG, VT, Custom);
2275 setOperationAction(ISD::FABS, VT, Custom);
2279
2283 };
2284
2285 // AVX512_FP16 scalar operations
2286 setGroup(MVT::f16);
2290 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2292 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2296 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2300 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2302 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2303 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2304
2307
2308 if (Subtarget.useAVX512Regs()) {
2309 setGroup(MVT::v32f16);
2315 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2317 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2319 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2322
2327 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2329 MVT::v32i16);
2330 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2332 MVT::v32i16);
2333 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2335 MVT::v32i16);
2336 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2338 MVT::v32i16);
2339
2343
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2345 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2346
2347 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2351 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2352 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2353 }
2354
2359
2360 if (Subtarget.hasVLX()) {
2361 setGroup(MVT::v8f16);
2362 setGroup(MVT::v16f16);
2363
2374
2377 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2379 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2381
2382 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2385
2389
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2393 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2394
2395 // Need to custom widen these to prevent scalarization.
2396 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2397 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2398
2399 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2402 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2403
2404 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2408 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2409 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2410 }
2411 }
2412
2413 if (!Subtarget.useSoftFloat() &&
2414 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2415 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2416 : &X86::VR128RegClass);
2417 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2418 : &X86::VR256RegClass);
2419 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2420 // provide a method to promote BUILD_VECTOR and INSERT_VECTOR_ELT, so set
2421 // the operation action to Custom and do the customization later.
2424 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2425 setF16Action(VT, Expand);
2426 if (!Subtarget.hasBF16())
2432 }
2433 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2434 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2435 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2436 }
2437 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2438 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2440 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2441 }
2442
2443 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2444 Subtarget.useAVX512Regs()) {
2445 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2446 setF16Action(MVT::v32bf16, Expand);
2447 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2448 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2449 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2451 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2455 }
2456
2457 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2458 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2464 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2468 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2469 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2474 setOperationAction(ISD::FSQRT, VT, Legal);
2477 setOperationAction(ISD::FMINIMUM, VT, Custom);
2478 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2479 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2480 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2481 }
2482 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2485 }
2486 }
2487
2488 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2491 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2493 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2494
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2497 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2499 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2500
2501 if (Subtarget.hasBWI()) {
2502 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2503 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2504 }
2505
2506 if (Subtarget.hasFP16()) {
2507 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2516 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2525 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2530 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2531 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2533 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2535 }
2536 }
2537
2538 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2539 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2540 }
2541
2542 // We want to custom lower some of our intrinsics.
2546 if (!Subtarget.is64Bit()) {
2548 }
2549
2550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2551 // handle type legalization for these operations here.
2552 //
2553 // FIXME: We really should do custom legalization for addition and
2554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2555 // than generic legalization for 64-bit multiplication-with-overflow, though.
2556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2557 if (VT == MVT::i64 && !Subtarget.is64Bit())
2558 continue;
2559 // Add/Sub/Mul with overflow operations are custom lowered.
2566
2567 // Support carry in as value rather than glue.
2573 }
2574
2575 // Combine sin / cos into _sincos_stret if it is available.
2576 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2577 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2578 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2579 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2580 }
2581
2582 if (Subtarget.isTargetWin64()) {
2583 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2585 setOperationAction(ISD::SREM, MVT::i128, Custom);
2586 setOperationAction(ISD::UREM, MVT::i128, Custom);
2595 }
2596
2597 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2598 // is. We should promote the value to 64-bits to solve this.
2599 // This is what the CRT headers do - `fmodf` is an inline header
2600 // function casting to f64 and calling `fmod`.
2601 if (Subtarget.is32Bit() &&
2602 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2603 // clang-format off
2604 for (ISD::NodeType Op :
2605 {ISD::FACOS, ISD::STRICT_FACOS,
2606 ISD::FASIN, ISD::STRICT_FASIN,
2607 ISD::FATAN, ISD::STRICT_FATAN,
2608 ISD::FATAN2, ISD::STRICT_FATAN2,
2609 ISD::FCEIL, ISD::STRICT_FCEIL,
2610 ISD::FCOS, ISD::STRICT_FCOS,
2611 ISD::FCOSH, ISD::STRICT_FCOSH,
2612 ISD::FEXP, ISD::STRICT_FEXP,
2613 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2615 ISD::FLOG, ISD::STRICT_FLOG,
2616 ISD::FLOG10, ISD::STRICT_FLOG10,
2617 ISD::FPOW, ISD::STRICT_FPOW,
2618 ISD::FSIN, ISD::STRICT_FSIN,
2619 ISD::FSINH, ISD::STRICT_FSINH,
2620 ISD::FTAN, ISD::STRICT_FTAN,
2621 ISD::FTANH, ISD::STRICT_FTANH,
2622 // TODO: Add ISD::STRICT_FMODF too once implemented.
2623 ISD::FMODF})
2624 if (isOperationExpand(Op, MVT::f32))
2625 setOperationAction(Op, MVT::f32, Promote);
2626 // clang-format on
2627
2628 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2629 // it, but it's just a wrapper around ldexp.
2630 if (Subtarget.isOSWindows()) {
2631 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2632 if (isOperationExpand(Op, MVT::f32))
2633 setOperationAction(Op, MVT::f32, Promote);
2634 }
2635
2636 // We have target-specific dag combine patterns for the following nodes:
2644 ISD::BITCAST,
2647 ISD::SHL,
2648 ISD::SRA,
2649 ISD::SRL,
2650 ISD::OR,
2651 ISD::AND,
2657 ISD::ADD,
2658 ISD::FADD,
2659 ISD::FSUB,
2660 ISD::FNEG,
2661 ISD::FMA,
2663 ISD::FMINNUM,
2664 ISD::FMAXNUM,
2665 ISD::SUB,
2666 ISD::LOAD,
2667 ISD::LRINT,
2668 ISD::LLRINT,
2669 ISD::MLOAD,
2670 ISD::STORE,
2671 ISD::MSTORE,
2687 ISD::SETCC,
2688 ISD::MUL,
2689 ISD::XOR,
2690 ISD::MSCATTER,
2691 ISD::MGATHER,
2692 ISD::FP16_TO_FP,
2693 ISD::FP_EXTEND,
2700
2701 computeRegisterProperties(Subtarget.getRegisterInfo());
2702
2703 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2705 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2707 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
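// For example (illustrative): with these limits, an inline @llvm.memcpy
// expansion may emit at most 8 stores; a copy that would need more than
// that is left to the memcpy library call instead.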
2709
2710 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2711 // that needs to be benchmarked and balanced with the potential use of vector
2712 // load/store types (PR33329, PR33914).
2715
2716 // Default loop alignment, which can be overridden by -align-loops.
2718
2719 // An out-of-order CPU can speculatively execute past a predictable branch,
2720 // but a conditional move could be stalled by an expensive earlier operation.
2721 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2722 EnableExtLdPromotion = true;
2724
2726
2727 // Default to having -disable-strictnode-mutation on
2728 IsStrictFPEnabled = true;
2729}
2730
2731// This has so far only been implemented for 64-bit MachO.
2733 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2734}
2735
2737 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2738 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2739}
2740
2742 const SDLoc &DL) const {
2743 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2744 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2745 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2746 return SDValue(Node, 0);
2747}
2748
2751 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2752 !Subtarget.hasBWI())
2753 return TypeSplitVector;
2754
2755 // Since v8f16 is legal, widen anything over v4f16.
2756 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2757 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2758 VT.getVectorElementType() == MVT::f16)
2759 return TypeSplitVector;
2760
2761 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2762 VT.getVectorElementType() != MVT::i1)
2763 return TypeWidenVector;
2764
2766}
2767
2768FastISel *
2770 const TargetLibraryInfo *libInfo) const {
2771 return X86::createFastISel(funcInfo, libInfo);
2772}
2773
2774//===----------------------------------------------------------------------===//
2775// Other Lowering Hooks
2776//===----------------------------------------------------------------------===//
2777
2779 bool AssumeSingleUse) {
2780 if (!AssumeSingleUse && !Op.hasOneUse())
2781 return false;
2782 if (!ISD::isNormalLoad(Op.getNode()))
2783 return false;
2784
2785 // If this is an unaligned vector, make sure the target supports folding it.
2786 auto *Ld = cast<LoadSDNode>(Op.getNode());
2787 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2788 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2789 return false;
2790
2791 // TODO: If this is a non-temporal load and the target has an instruction
2792 // for it, it should not be folded. See "useNonTemporalLoad()".
2793
2794 return true;
2795}
2796
2798 const X86Subtarget &Subtarget,
2799 bool AssumeSingleUse) {
2800 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2801 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2802 return false;
2803
2804 // We can not replace a wide volatile load with a broadcast-from-memory,
2805 // because that would narrow the load, which isn't legal for volatiles.
2806 auto *Ld = cast<LoadSDNode>(Op.getNode());
2807 return !Ld->isVolatile() ||
2808 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2809}
2810
2812 if (!Op.hasOneUse())
2813 return false;
2814 // Peek through (oneuse) bitcast users
2815 SDNode *User = *Op->user_begin();
2816 while (User->getOpcode() == ISD::BITCAST) {
2817 if (!User->hasOneUse())
2818 return false;
2819 User = *User->user_begin();
2820 }
2821 return ISD::isNormalStore(User);
2822}
2823
2825 if (Op.hasOneUse()) {
2826 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2827 return (ISD::ZERO_EXTEND == Opcode);
2828 }
2829 return false;
2830}
2831
2832static bool isLogicOp(unsigned Opcode) {
2833 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2834 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2835}
2836
2837static bool isTargetShuffle(unsigned Opcode) {
2838 switch(Opcode) {
2839 default: return false;
2840 case X86ISD::BLENDI:
2841 case X86ISD::PSHUFB:
2842 case X86ISD::PSHUFD:
2843 case X86ISD::PSHUFHW:
2844 case X86ISD::PSHUFLW:
2845 case X86ISD::SHUFP:
2846 case X86ISD::INSERTPS:
2847 case X86ISD::EXTRQI:
2848 case X86ISD::INSERTQI:
2849 case X86ISD::VALIGN:
2850 case X86ISD::PALIGNR:
2851 case X86ISD::VSHLDQ:
2852 case X86ISD::VSRLDQ:
2853 case X86ISD::MOVLHPS:
2854 case X86ISD::MOVHLPS:
2855 case X86ISD::MOVSHDUP:
2856 case X86ISD::MOVSLDUP:
2857 case X86ISD::MOVDDUP:
2858 case X86ISD::MOVSS:
2859 case X86ISD::MOVSD:
2860 case X86ISD::MOVSH:
2861 case X86ISD::UNPCKL:
2862 case X86ISD::UNPCKH:
2863 case X86ISD::VBROADCAST:
2864 case X86ISD::VPERMILPI:
2865 case X86ISD::VPERMILPV:
2866 case X86ISD::VPERM2X128:
2867 case X86ISD::SHUF128:
2868 case X86ISD::VPERMIL2:
2869 case X86ISD::VPERMI:
2870 case X86ISD::VPPERM:
2871 case X86ISD::VPERMV:
2872 case X86ISD::VPERMV3:
2873 case X86ISD::VZEXT_MOVL:
2874 return true;
2875 }
2876}
2877
2878static bool isTargetShuffleVariableMask(unsigned Opcode) {
2879 switch (Opcode) {
2880 default: return false;
2881 // Target Shuffles.
2882 case X86ISD::PSHUFB:
2883 case X86ISD::VPERMILPV:
2884 case X86ISD::VPERMIL2:
2885 case X86ISD::VPPERM:
2886 case X86ISD::VPERMV:
2887 case X86ISD::VPERMV3:
2888 return true;
2889 // 'Faux' Target Shuffles.
2890 case ISD::OR:
2891 case ISD::AND:
2892 case X86ISD::ANDNP:
2893 return true;
2894 }
2895}
2896
2899 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2901 int ReturnAddrIndex = FuncInfo->getRAIndex();
2902
2903 if (ReturnAddrIndex == 0) {
2904 // Set up a frame object for the return address.
2905 unsigned SlotSize = RegInfo->getSlotSize();
2906 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2907 -(int64_t)SlotSize,
2908 false);
2909 FuncInfo->setRAIndex(ReturnAddrIndex);
2910 }
2911
2912 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2913}
2914
2916 bool HasSymbolicDisplacement) {
2917 // Offset should fit into 32 bit immediate field.
2918 if (!isInt<32>(Offset))
2919 return false;
2920
2921 // If we don't have a symbolic displacement - we don't have any extra
2922 // restrictions.
2923 if (!HasSymbolicDisplacement)
2924 return true;
2925
2926 // We can fold large offsets in the large code model because we always use
2927 // 64-bit offsets.
2928 if (CM == CodeModel::Large)
2929 return true;
2930
2931 // For the kernel code model we know that all objects reside in the negative
2932 // half of the 32-bit address space. We must not accept negative offsets, since
2933 // they may be just out of range, but we may accept pretty large positive ones.
2934 if (CM == CodeModel::Kernel)
2935 return Offset >= 0;
2936
2937 // For other non-large code models we assume that the last small object is 16MB
2938 // before the end of the 31-bit boundary. We may also accept pretty large
2939 // negative constants, knowing that all objects are in the positive half of the
2940 // address space.
2941 return Offset < 16 * 1024 * 1024;
2942}
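// For example (illustrative): with a symbolic displacement under the small
// code model, an offset of 8MB is accepted (it is below the 16MB bound),
// while the kernel code model rejects any negative offset.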
2943
2944/// Return true if the condition is a signed comparison operation.
2945static bool isX86CCSigned(X86::CondCode X86CC) {
2946 switch (X86CC) {
2947 default:
2948 llvm_unreachable("Invalid integer condition!");
2949 case X86::COND_E:
2950 case X86::COND_NE:
2951 case X86::COND_B:
2952 case X86::COND_A:
2953 case X86::COND_BE:
2954 case X86::COND_AE:
2955 return false;
2956 case X86::COND_G:
2957 case X86::COND_GE:
2958 case X86::COND_L:
2959 case X86::COND_LE:
2960 return true;
2961 }
2962}
2963
2965 switch (SetCCOpcode) {
2966 // clang-format off
2967 default: llvm_unreachable("Invalid integer condition!");
2968 case ISD::SETEQ: return X86::COND_E;
2969 case ISD::SETGT: return X86::COND_G;
2970 case ISD::SETGE: return X86::COND_GE;
2971 case ISD::SETLT: return X86::COND_L;
2972 case ISD::SETLE: return X86::COND_LE;
2973 case ISD::SETNE: return X86::COND_NE;
2974 case ISD::SETULT: return X86::COND_B;
2975 case ISD::SETUGT: return X86::COND_A;
2976 case ISD::SETULE: return X86::COND_BE;
2977 case ISD::SETUGE: return X86::COND_AE;
2978 // clang-format on
2979 }
2980}
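// For example (illustrative): the unsigned ISD::SETULT maps to the CF-based
// X86::COND_B ("below"), while the signed ISD::SETLT maps to X86::COND_L
// ("less"), matching the switch above.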
2981
2982/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2983/// condition code, returning the condition code and the LHS/RHS of the
2984/// comparison to make.
2986 bool isFP, SDValue &LHS, SDValue &RHS,
2987 SelectionDAG &DAG) {
2988 if (!isFP) {
2990 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2991 // X > -1 -> X == 0, jump !sign.
2992 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2993 return X86::COND_NS;
2994 }
2995 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2996 // X < 0 -> X == 0, jump on sign.
2997 return X86::COND_S;
2998 }
2999 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3000 // X >= 0 -> X == 0, jump on !sign.
3001 return X86::COND_NS;
3002 }
3003 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3004 // X < 1 -> X <= 0
3005 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3006 return X86::COND_LE;
3007 }
3008 }
3009
3010 return TranslateIntegerX86CC(SetCCOpcode);
3011 }
3012
3013 // First determine if it is required or is profitable to flip the operands.
3014
3015 // If LHS is a foldable load, but RHS is not, flip the condition.
3016 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3017 !ISD::isNON_EXTLoad(RHS.getNode())) {
3018 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3019 std::swap(LHS, RHS);
3020 }
3021
3022 switch (SetCCOpcode) {
3023 default: break;
3024 case ISD::SETOLT:
3025 case ISD::SETOLE:
3026 case ISD::SETUGT:
3027 case ISD::SETUGE:
3028 std::swap(LHS, RHS);
3029 break;
3030 }
3031
3032 // On a floating point condition, the flags are set as follows:
3033 // ZF PF CF op
3034 // 0 | 0 | 0 | X > Y
3035 // 0 | 0 | 1 | X < Y
3036 // 1 | 0 | 0 | X == Y
3037 // 1 | 1 | 1 | unordered
3038 switch (SetCCOpcode) {
3039 // clang-format off
3040 default: llvm_unreachable("Condcode should be pre-legalized away");
3041 case ISD::SETUEQ:
3042 case ISD::SETEQ: return X86::COND_E;
3043 case ISD::SETOLT: // flipped
3044 case ISD::SETOGT:
3045 case ISD::SETGT: return X86::COND_A;
3046 case ISD::SETOLE: // flipped
3047 case ISD::SETOGE:
3048 case ISD::SETGE: return X86::COND_AE;
3049 case ISD::SETUGT: // flipped
3050 case ISD::SETULT:
3051 case ISD::SETLT: return X86::COND_B;
3052 case ISD::SETUGE: // flipped
3053 case ISD::SETULE:
3054 case ISD::SETLE: return X86::COND_BE;
3055 case ISD::SETONE:
3056 case ISD::SETNE: return X86::COND_NE;
3057 case ISD::SETUO: return X86::COND_P;
3058 case ISD::SETO: return X86::COND_NP;
3059 case ISD::SETOEQ:
3060 case ISD::SETUNE: return X86::COND_INVALID;
3061 // clang-format on
3062 }
3063}
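// For example (illustrative): for ISD::SETOLT the operands are swapped above
// and the comparison becomes X86::COND_A on the swapped pair, since the flags
// produced by a (U)COMIS*-style compare only encode "above"/"below" as shown
// in the table above.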
3064
3065/// Is there a floating point cmov for the specific X86 condition code?
3066/// Current x86 isa includes the following FP cmov instructions:
3067/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3068static bool hasFPCMov(unsigned X86CC) {
3069 switch (X86CC) {
3070 default:
3071 return false;
3072 case X86::COND_B:
3073 case X86::COND_BE:
3074 case X86::COND_E:
3075 case X86::COND_P:
3076 case X86::COND_A:
3077 case X86::COND_AE:
3078 case X86::COND_NE:
3079 case X86::COND_NP:
3080 return true;
3081 }
3082}
3083
3084static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3085 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3086 VT.is512BitVector();
3087}
3088
3090 const CallInst &I,
3091 MachineFunction &MF,
3092 unsigned Intrinsic) const {
3093 Info.flags = MachineMemOperand::MONone;
3094 Info.offset = 0;
3095
3097 if (!IntrData) {
3098 switch (Intrinsic) {
3099 case Intrinsic::x86_aesenc128kl:
3100 case Intrinsic::x86_aesdec128kl:
3101 Info.opc = ISD::INTRINSIC_W_CHAIN;
3102 Info.ptrVal = I.getArgOperand(1);
3103 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3104 Info.align = Align(1);
3105 Info.flags |= MachineMemOperand::MOLoad;
3106 return true;
3107 case Intrinsic::x86_aesenc256kl:
3108 case Intrinsic::x86_aesdec256kl:
3109 Info.opc = ISD::INTRINSIC_W_CHAIN;
3110 Info.ptrVal = I.getArgOperand(1);
3111 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3112 Info.align = Align(1);
3113 Info.flags |= MachineMemOperand::MOLoad;
3114 return true;
3115 case Intrinsic::x86_aesencwide128kl:
3116 case Intrinsic::x86_aesdecwide128kl:
3117 Info.opc = ISD::INTRINSIC_W_CHAIN;
3118 Info.ptrVal = I.getArgOperand(0);
3119 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3120 Info.align = Align(1);
3121 Info.flags |= MachineMemOperand::MOLoad;
3122 return true;
3123 case Intrinsic::x86_aesencwide256kl:
3124 case Intrinsic::x86_aesdecwide256kl:
3125 Info.opc = ISD::INTRINSIC_W_CHAIN;
3126 Info.ptrVal = I.getArgOperand(0);
3127 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3128 Info.align = Align(1);
3129 Info.flags |= MachineMemOperand::MOLoad;
3130 return true;
3131 case Intrinsic::x86_cmpccxadd32:
3132 case Intrinsic::x86_cmpccxadd64:
3133 case Intrinsic::x86_atomic_bts:
3134 case Intrinsic::x86_atomic_btc:
3135 case Intrinsic::x86_atomic_btr: {
3136 Info.opc = ISD::INTRINSIC_W_CHAIN;
3137 Info.ptrVal = I.getArgOperand(0);
3138 unsigned Size = I.getType()->getScalarSizeInBits();
3139 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3140 Info.align = Align(Size);
3143 return true;
3144 }
3145 case Intrinsic::x86_atomic_bts_rm:
3146 case Intrinsic::x86_atomic_btc_rm:
3147 case Intrinsic::x86_atomic_btr_rm: {
3148 Info.opc = ISD::INTRINSIC_W_CHAIN;
3149 Info.ptrVal = I.getArgOperand(0);
3150 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3151 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3152 Info.align = Align(Size);
3155 return true;
3156 }
3157 case Intrinsic::x86_aadd32:
3158 case Intrinsic::x86_aadd64:
3159 case Intrinsic::x86_aand32:
3160 case Intrinsic::x86_aand64:
3161 case Intrinsic::x86_aor32:
3162 case Intrinsic::x86_aor64:
3163 case Intrinsic::x86_axor32:
3164 case Intrinsic::x86_axor64:
3165 case Intrinsic::x86_atomic_add_cc:
3166 case Intrinsic::x86_atomic_sub_cc:
3167 case Intrinsic::x86_atomic_or_cc:
3168 case Intrinsic::x86_atomic_and_cc:
3169 case Intrinsic::x86_atomic_xor_cc: {
3170 Info.opc = ISD::INTRINSIC_W_CHAIN;
3171 Info.ptrVal = I.getArgOperand(0);
3172 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3173 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3174 Info.align = Align(Size);
3177 return true;
3178 }
3179 }
3180 return false;
3181 }
3182
3183 switch (IntrData->Type) {
3186 case TRUNCATE_TO_MEM_VI32: {
3187 Info.opc = ISD::INTRINSIC_VOID;
3188 Info.ptrVal = I.getArgOperand(0);
3189 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3191 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3192 ScalarVT = MVT::i8;
3193 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3194 ScalarVT = MVT::i16;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3196 ScalarVT = MVT::i32;
3197
3198 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3199 Info.align = Align(1);
3200 Info.flags |= MachineMemOperand::MOStore;
3201 break;
3202 }
3203 case GATHER:
3204 case GATHER_AVX2: {
3205 Info.opc = ISD::INTRINSIC_W_CHAIN;
3206 Info.ptrVal = nullptr;
3207 MVT DataVT = MVT::getVT(I.getType());
3208 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3209 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3210 IndexVT.getVectorNumElements());
3211 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3212 Info.align = Align(1);
3213 Info.flags |= MachineMemOperand::MOLoad;
3214 break;
3215 }
3216 case SCATTER: {
3217 Info.opc = ISD::INTRINSIC_VOID;
3218 Info.ptrVal = nullptr;
3219 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3220 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3221 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3222 IndexVT.getVectorNumElements());
3223 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3224 Info.align = Align(1);
3225 Info.flags |= MachineMemOperand::MOStore;
3226 break;
3227 }
3228 default:
3229 return false;
3230 }
3231
3232 return true;
3233}
3234
3235/// Returns true if the target can instruction select the
3236/// specified FP immediate natively. If false, the legalizer will
3237/// materialize the FP immediate as a load from a constant pool.
3239 bool ForCodeSize) const {
3240 for (const APFloat &FPImm : LegalFPImmediates)
3241 if (Imm.bitwiseIsEqual(FPImm))
3242 return true;
3243 return false;
3244}
3245
3247 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3248 std::optional<unsigned> ByteOffset) const {
3249 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3250
3251 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3252 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3253 N = *N->user_begin();
3254 return N;
3255 };
3256
3257 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3258 // relocations target a movq or addq instruction: don't let the load shrink.
3259 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3260 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3261 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3262 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3263
3264 // If this is (1) an AVX vector load with (2) multiple uses and either (3) all
3265 // of those uses are extracted directly into stores, so the extract + store
3266 // can be store-folded, or (4) some use is a legal full-width instruction,
3267 // then it's probably not worth splitting the load.
3268 EVT VT = Load->getValueType(0);
3269 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3270 !SDValue(Load, 0).hasOneUse()) {
3271 bool FullWidthUse = false;
3272 bool AllExtractStores = true;
3273 for (SDUse &Use : Load->uses()) {
3274 // Skip uses of the chain value. Result 0 of the node is the load value.
3275 if (Use.getResNo() != 0)
3276 continue;
3277
3278 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3279
3280 // If this use is an extract + store, it's probably not worth splitting.
3281 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3282 all_of(User->uses(), [&](const SDUse &U) {
3283 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3284 return Inner->getOpcode() == ISD::STORE;
3285 }))
3286 continue;
3287
3288 AllExtractStores = false;
3289
3290 // If any use is a full width legal/target bin op, then assume it's legal
3291 // and won't split.
3292 if (isBinOp(User->getOpcode()) &&
3293 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3294 User->getOpcode() > ISD::BUILTIN_OP_END))
3295 FullWidthUse = true;
3296 }
3297
3298 if (AllExtractStores)
3299 return false;
3300
3301 // If we have a user that uses the full vector width, then splitting is
3302 // only worthwhile if the offset isn't 0 (to avoid an
3303 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3304 if (FullWidthUse)
3305 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3306 }
3307
3308 return true;
3309}
3310
3311/// Returns true if it is beneficial to convert a load of a constant
3312/// to just the constant itself.
3314 Type *Ty) const {
3315 assert(Ty->isIntegerTy());
3316
3317 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3318 if (BitSize == 0 || BitSize > 64)
3319 return false;
3320 return true;
3321}
3322
3324 // If we are using XMM registers in the ABI and the condition of the select is
3325 // a floating-point compare and we have blendv or conditional move, then it is
3326 // cheaper to select instead of doing a cross-register move and creating a
3327 // load that depends on the compare result.
3328 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3329 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3330}
3331
3333 // TODO: It might be a win to ease or lift this restriction, but the generic
3334 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3335 if (VT.isVector() && Subtarget.hasAVX512())
3336 return false;
3337
3338 return true;
3339}
3340
3342 SDValue C) const {
3343 // TODO: We handle scalars using custom code, but generic combining could make
3344 // that unnecessary.
3345 APInt MulC;
3346 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3347 return false;
3348
3349 // Find the type this will be legalized to. Otherwise we might prematurely
3350 // convert this to shl+add/sub and then still have to type legalize those ops.
3351 // Another choice would be to defer the decision for illegal types until
3352 // after type legalization. But constant splat vectors of i64 can't make it
3353 // through type legalization on 32-bit targets so we would need to special
3354 // case vXi64.
3355 while (getTypeAction(Context, VT) != TypeLegal)
3356 VT = getTypeToTransformTo(Context, VT);
3357
3358 // If vector multiply is legal, assume that's faster than shl + add/sub.
3359 // Multiply is a complex op with higher latency and lower throughput in
3360 // most implementations, sub-vXi32 vector multiplies are always fast,
3361 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3362 // is always going to be slow.
3363 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3364 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3365 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3366 return false;
3367
3368 // shl+add, shl+sub, shl+add+neg
3369 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3370 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3371}
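// For example (illustrative): when the multiply itself isn't considered cheap
// by the check above, a constant splat of 17 returns true (17 - 1 is a power
// of two, so shl+add suffices), whereas a splat of 10 returns false.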
3372
3374 unsigned Index) const {
3376 return false;
3377
3378 // Mask vectors support all subregister combinations and operations that
3379 // extract half of the vector.
3380 if (ResVT.getVectorElementType() == MVT::i1)
3381 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3382 (Index == ResVT.getVectorNumElements()));
3383
3384 return (Index % ResVT.getVectorNumElements()) == 0;
3385}
3386
3388 unsigned Opc = VecOp.getOpcode();
3389
3390 // Assume target opcodes can't be scalarized.
3391 // TODO - do we have any exceptions?
3392 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3393 return false;
3394
3395 // If the vector op is not supported, try to convert to scalar.
3396 EVT VecVT = VecOp.getValueType();
3398 return true;
3399
3400 // If the vector op is supported, but the scalar op is not, the transform may
3401 // not be worthwhile.
3402 EVT ScalarVT = VecVT.getScalarType();
3403 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3404}
3405
3407 bool) const {
3408 // TODO: Allow vectors?
3409 if (VT.isVector())
3410 return false;
3411 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3412}
3413
3415 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3416 // i32/i64 or can rely on BSF passthrough value.
3417 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3418 Subtarget.hasBitScanPassThrough() ||
3419 (!Ty->isVectorTy() &&
3420 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3421}
3422
3424 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3425 // passthrough value.
3426 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3427 Subtarget.hasBitScanPassThrough();
3428}
3429
3431 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3432 // expensive than a straight movsd. On the other hand, it's important to
3433 // shrink long double fp constant since fldt is very slow.
3434 return !Subtarget.hasSSE2() || VT == MVT::f80;
3435}
3436
3438 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3439 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3440}
3441
3443 const SelectionDAG &DAG,
3444 const MachineMemOperand &MMO) const {
3445 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3446 BitcastVT.getVectorElementType() == MVT::i1)
3447 return false;
3448
3449 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3450 return false;
3451
3452 // If both types are legal vectors, it's always ok to convert them.
3453 if (LoadVT.isVector() && BitcastVT.isVector() &&
3454 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3455 return true;
3456
3457 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3458}
3459
3461 const MachineFunction &MF) const {
3462 // Do not merge to float value size (128 bits) if no implicit
3463 // float attribute is set.
3464 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3465
3466 if (NoFloat) {
3467 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3468 return (MemVT.getSizeInBits() <= MaxIntSize);
3469 }
3470 // Make sure we don't merge greater than our preferred vector
3471 // width.
3472 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3473 return false;
3474
3475 return true;
3476}
3477
3479 return Subtarget.hasFastLZCNT();
3480}
3481
3483 const Instruction &AndI) const {
3484 return true;
3485}
3486
3488 EVT VT = Y.getValueType();
3489
3490 if (VT.isVector())
3491 return false;
3492
3493 if (!Subtarget.hasBMI())
3494 return false;
3495
3496 // There are only 32-bit and 64-bit forms for 'andn'.
3497 if (VT != MVT::i32 && VT != MVT::i64)
3498 return false;
3499
3500 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3501}
3502
3504 EVT VT = Y.getValueType();
3505
3506 if (!VT.isVector())
3507 return hasAndNotCompare(Y);
3508
3509 // Vector.
3510
3511 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3512 return false;
3513
3514 if (VT == MVT::v4i32)
3515 return true;
3516
3517 return Subtarget.hasSSE2();
3518}
3519
3521 return X.getValueType().isScalarInteger(); // 'bt'
3522}
3523
3527 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3528 SelectionDAG &DAG) const {
3529 // Does the baseline recommend not performing the fold by default?
3531 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3532 return false;
3533 // For scalars this transform is always beneficial.
3534 if (X.getValueType().isScalarInteger())
3535 return true;
3536 // If all the shift amounts are identical, then the transform is beneficial even
3537 // with rudimentary SSE2 shifts.
3538 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3539 return true;
3540 // If we have AVX2 with its powerful shift operations, then it's also good.
3541 if (Subtarget.hasAVX2())
3542 return true;
3543 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3544 return NewShiftOpcode == ISD::SHL;
3545}
3546
3548 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3549 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3550 if (!VT.isInteger())
3551 return ShiftOpc;
3552
3553 bool PreferRotate = false;
3554 if (VT.isVector()) {
3555 // For vectors, if we have rotate instruction support, then it's definitely
3556 // best. Otherwise it's not clear what's best, so just don't make changes.
3557 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3558 VT.getScalarType() == MVT::i64);
3559 } else {
3560 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3561 // rotate unless we have a zext mask+shr.
3562 PreferRotate = Subtarget.hasBMI2();
3563 if (!PreferRotate) {
3564 unsigned MaskBits =
3565 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3566 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3567 }
3568 }
3569
3570 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3571 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3572
3573 if (PreferRotate && MayTransformRotate)
3574 return ISD::ROTL;
3575
3576 // If vector, we don't really get much benefit swapping around constants.
3577 // Maybe we could check if the DAG has the flipped node already in the
3578 // future.
3579 if (VT.isVector())
3580 return ShiftOpc;
3581
3582 // See if it's beneficial to swap the shift type.
3583 if (ShiftOpc == ISD::SHL) {
3584 // If the current setup has an imm64 mask, then the inverse will have
3585 // at least an imm32 mask (or be a zext i32 -> i64).
3586 if (VT == MVT::i64)
3587 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3588 : ShiftOpc;
3589
3590 // We can only benefit if the mask requires at least 7 bits. We
3591 // don't want to replace shl by 1, 2, or 3, as those can be implemented
3592 // with lea/add.
3593 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3594 }
3595
3596 if (VT == MVT::i64)
3597 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3598 // extremely efficient.
3599 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3600
3601 // Keep small shifts as shl so we can generate add/lea.
3602 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3603 }
3604
3605 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3606 // (PreferRotate will be set in the latter case).
3607 if (PreferRotate || !MayTransformRotate || VT.isVector())
3608 return ShiftOpc;
3609
3610 // Non-vector type and we have a zext mask with SRL.
3611 return ISD::SRL;
3612}
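// For example (illustrative): for a scalar i32 srl by 24 without BMI2, the
// remaining mask is 8 bits, so the zext-style mask (movzbl) is preferred and
// the SRL is kept rather than being swapped to a rotate or shl.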
3613
3616 const Value *Lhs,
3617 const Value *Rhs) const {
3618 using namespace llvm::PatternMatch;
3619 int BaseCost = BrMergingBaseCostThresh.getValue();
3620 // With CCMP, branches can be merged in a more efficient way.
3621 if (BaseCost >= 0 && Subtarget.hasCCMP())
3622 BaseCost += BrMergingCcmpBias;
3623 // a == b && a == c is a fast pattern on x86.
3624 if (BaseCost >= 0 && Opc == Instruction::And &&
3627 BaseCost += 1;
3628 return {BaseCost, BrMergingLikelyBias.getValue(),
3629 BrMergingUnlikelyBias.getValue()};
3630}
3631
3633 return N->getOpcode() != ISD::FP_EXTEND;
3634}
3635
3637 const SDNode *N, CombineLevel Level) const {
3638 assert(((N->getOpcode() == ISD::SHL &&
3639 N->getOperand(0).getOpcode() == ISD::SRL) ||
3640 (N->getOpcode() == ISD::SRL &&
3641 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3642 "Expected shift-shift mask");
3643 // TODO: Should we always create i64 masks? Or only folded immediates?
3644 EVT VT = N->getValueType(0);
3645 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3646 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3647 // Only fold if the shift values are equal - so it folds to AND.
3648 // TODO - we should fold if either is a non-uniform vector but we don't do
3649 // the fold for non-splats yet.
3650 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3651 }
3653}
3654
3656 EVT VT = Y.getValueType();
3657
3658 // For vectors, we don't have a preference, but we probably want a mask.
3659 if (VT.isVector())
3660 return false;
3661
3662 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3663 if (VT == MVT::i64 && !Subtarget.is64Bit())
3664 return false;
3665
3666 return true;
3667}
3668
3671 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3673 !Subtarget.isOSWindows())
3676 ExpansionFactor);
3677}
3678
3680 // Any legal vector type can be splatted more efficiently than
3681 // loading/spilling from memory.
3682 return isTypeLegal(VT);
3683}
3684
3686 MVT VT = MVT::getIntegerVT(NumBits);
3687 if (isTypeLegal(VT))
3688 return VT;
3689
3690 // PMOVMSKB can handle this.
3691 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3692 return MVT::v16i8;
3693
3694 // VPMOVMSKB can handle this.
3695 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3696 return MVT::v32i8;
3697
3698 // TODO: Allow 64-bit type for 32-bit target.
3699 // TODO: 512-bit types should be allowed, but make sure that those
3700 // cases are handled in combineVectorSizedSetCCEquality().
3701
3703}
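// For example (illustrative): a 32-byte equality memcmp can be expanded as a
// single v32i8 compare plus VPMOVMSKB on AVX2 instead of four i64 compares,
// which is why 256 bits maps to MVT::v32i8 above.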
3704
3705/// Val is the undef sentinel value or equal to the specified value.
3706static bool isUndefOrEqual(int Val, int CmpVal) {
3707 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3708}
3709
3710/// Return true if every element in Mask is the undef sentinel value or equal to
3711/// the specified value.
3712static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3713 return llvm::all_of(Mask, [CmpVal](int M) {
3714 return (M == SM_SentinelUndef) || (M == CmpVal);
3715 });
3716}
3717
3718/// Return true if every element in Mask, beginning from position Pos and ending
3719/// in Pos+Size is the undef sentinel value or equal to the specified value.
3720static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3721 unsigned Size) {
3722 return llvm::all_of(Mask.slice(Pos, Size),
3723 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3724}
3725
3726/// Val is either the undef or zero sentinel value.
3727static bool isUndefOrZero(int Val) {
3728 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3729}
3730
3731/// Return true if every element in Mask, beginning from position Pos and ending
3732/// in Pos+Size is the undef sentinel value.
3733static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3734 return llvm::all_of(Mask.slice(Pos, Size),
3735 [](int M) { return M == SM_SentinelUndef; });
3736}
3737
3738/// Return true if the mask creates a vector whose lower half is undefined.
3740 unsigned NumElts = Mask.size();
3741 return isUndefInRange(Mask, 0, NumElts / 2);
3742}
3743
3744/// Return true if the mask creates a vector whose upper half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3748}
3749
3750/// Return true if Val falls within the specified range [Low, Hi).
3751static bool isInRange(int Val, int Low, int Hi) {
3752 return (Val >= Low && Val < Hi);
3753}
3754
3755/// Return true if the value of any element in Mask falls within the specified
3756/// range [Low, Hi).
3757static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3758 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3759}
3760
3761/// Return true if the value of any element in Mask is the zero sentinel value.
3762static bool isAnyZero(ArrayRef<int> Mask) {
3763 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3764}
3765
3766/// Return true if Val is undef or if its value falls within the
3767/// specified range [Low, Hi).
3768static bool isUndefOrInRange(int Val, int Low, int Hi) {
3769 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3770}
3771
3772/// Return true if every element in Mask is undef or if its value
3773/// falls within the specified range [Low, Hi).
3774static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3775 return llvm::all_of(
3776 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3777}
3778
3779/// Return true if Val is undef, zero or if its value falls within the
3780/// specified range [Low, Hi).
3781static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3782 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3783}
3784
3785/// Return true if every element in Mask is undef, zero or if its value
3786/// falls within the specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3788 return llvm::all_of(
3789 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3790}
3791
3792/// Return true if every element in Mask is an in-place blend/select mask or is
3793/// undef.
3795 unsigned NumElts = Mask.size();
3796 for (auto [I, M] : enumerate(Mask))
3797 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3798 return false;
3799 return true;
3800}
3801
3802/// Return true if every element in Mask, beginning
3803/// from position Pos and ending in Pos + Size, falls within the specified
3804/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3805static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3806 unsigned Size, int Low, int Step = 1) {
3807 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3808 if (!isUndefOrEqual(Mask[i], Low))
3809 return false;
3810 return true;
3811}
3812
3813/// Return true if every element in Mask, beginning
3814/// from position Pos and ending in Pos+Size, falls within the specified
3815/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
3817 unsigned Size, int Low,
3818 int Step = 1) {
3819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3821 return false;
3822 return true;
3823}
3824
3825/// Return true if every element in Mask, beginning
3826/// from position Pos and ending in Pos+Size is undef or is zero.
3827static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3828 unsigned Size) {
3829 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3830}
3831
3832/// Return true if every element of a single input is referenced by the shuffle
3833/// mask, i.e. it just permutes them all.
3835 unsigned NumElts = Mask.size();
3836 APInt DemandedElts = APInt::getZero(NumElts);
3837 for (int M : Mask)
3838 if (isInRange(M, 0, NumElts))
3839 DemandedElts.setBit(M);
3840 return DemandedElts.isAllOnes();
3841}
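// For example (illustrative): the mask {2, 0, 3, 1} references every source
// element and is a complete permute, while {0, 0, 1, 1} leaves elements 2 and
// 3 unreferenced and returns false.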
3842
3843/// Helper function to test whether a shuffle mask could be
3844/// simplified by widening the elements being shuffled.
3845///
3846/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3847/// leaves it in an unspecified state.
3848///
3849/// NOTE: This must handle normal vector shuffle masks and *target* vector
3850/// shuffle masks. The latter have the special property of a '-2' representing
3851/// a zeroed lane of a vector.
3853 SmallVectorImpl<int> &WidenedMask) {
3854 WidenedMask.assign(Mask.size() / 2, 0);
3855 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3856 int M0 = Mask[i];
3857 int M1 = Mask[i + 1];
3858
3859 // If both elements are undef, its trivial.
3860 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3861 WidenedMask[i / 2] = SM_SentinelUndef;
3862 continue;
3863 }
3864
3865 // Check for an undef mask and a mask value properly aligned to fit with
3866 // a pair of values. If we find such a case, use the non-undef mask's value.
3867 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3868 WidenedMask[i / 2] = M1 / 2;
3869 continue;
3870 }
3871 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3872 WidenedMask[i / 2] = M0 / 2;
3873 continue;
3874 }
3875
3876 // When zeroing, we need to spread the zeroing across both lanes to widen.
3877 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3878 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3880 WidenedMask[i / 2] = SM_SentinelZero;
3881 continue;
3882 }
3883 return false;
3884 }
3885
3886 // Finally check if the two mask values are adjacent and aligned with
3887 // a pair.
3888 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3889 WidenedMask[i / 2] = M0 / 2;
3890 continue;
3891 }
3892
3893 // Otherwise we can't safely widen the elements used in this shuffle.
3894 return false;
3895 }
3896 assert(WidenedMask.size() == Mask.size() / 2 &&
3897 "Incorrect size of mask after widening the elements!");
3898
3899 return true;
3900}
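// For example (illustrative): the v4 mask {0, 1, 6, 7} widens to the v2 mask
// {0, 3}, while {1, 2, 5, 6} cannot be widened because its pairs do not fall
// on even element boundaries.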
3901
3903 const APInt &Zeroable,
3904 bool V2IsZero,
3905 SmallVectorImpl<int> &WidenedMask) {
3906 // Create an alternative mask with info about zeroable elements.
3907 // Here we do not set undef elements as zeroable.
3908 SmallVector<int, 64> ZeroableMask(Mask);
3909 if (V2IsZero) {
3910 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3911 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3912 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3913 ZeroableMask[i] = SM_SentinelZero;
3914 }
3915 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3916}
3917
3918static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3919 SmallVector<int, 32> WidenedMask;
3920 return canWidenShuffleElements(Mask, WidenedMask);
3921}
3922
3923// Attempt to narrow/widen shuffle mask until it matches the target number of
3924// elements.
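// For example (illustrative, not from the source): scaling <0,1,2,3> up to 8
// elements narrows it to <0,1,2,3,4,5,6,7>, while scaling <0,1,6,7> down to 2
// elements widens it to <0,3>; a mask such as <1,0,2,3> cannot be scaled down
// and the helper reports failure.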
3925static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3926 SmallVectorImpl<int> &ScaledMask) {
3927 unsigned NumSrcElts = Mask.size();
3928 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3929 "Illegal shuffle scale factor");
3930
3931 // Narrowing is guaranteed to work.
3932 if (NumDstElts >= NumSrcElts) {
3933 int Scale = NumDstElts / NumSrcElts;
3934 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3935 return true;
3936 }
3937
3938 // We have to repeat the widening until we reach the target size, but we can
3939 // split out the first widening as it sets up ScaledMask for us.
3940 if (canWidenShuffleElements(Mask, ScaledMask)) {
3941 while (ScaledMask.size() > NumDstElts) {
3942 SmallVector<int, 16> WidenedMask;
3943 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3944 return false;
3945 ScaledMask = std::move(WidenedMask);
3946 }
3947 return true;
3948 }
3949
3950 return false;
3951}
3952
3953static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3954 SmallVector<int, 32> ScaledMask;
3955 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3956}
3957
3958// Helper to grow the shuffle mask for a larger value type.
3959// NOTE: This is different from scaleShuffleElements, which adjusts the mask for
3959// the same total vector size.
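// Worked example (illustrative): growing the two-operand v4 mask <0,5,2,7>
// from a 128-bit to a 256-bit type (Scale = 2) rebases the second-operand
// indices past the widened first operand and pads with undefs, giving
// <0,9,2,11,-1,-1,-1,-1>.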
3960static void growShuffleMask(ArrayRef<int> SrcMask,
3961 SmallVectorImpl<int> &DstMask,
3962 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3963 assert(DstMask.empty() && "Expected an empty shuffle mask");
3964 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3965 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3966 unsigned NumSrcElts = SrcMask.size();
3967 DstMask.assign(SrcMask.begin(), SrcMask.end());
3968 for (int &M : DstMask) {
3969 if (M < 0)
3970 continue;
3971 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3972 }
3973 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3974}
3975
3976/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3977bool X86::isZeroNode(SDValue Elt) {
3978 return isNullConstant(Elt) || isNullFPConstant(Elt);
3979}
3980
3981// Build a vector of constants.
3982// Use an UNDEF node if MaskElt == -1.
3983// Split 64-bit constants in 32-bit mode.
3984static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3985 const SDLoc &dl, bool IsMask = false) {
3986
3988 bool Split = false;
3989
3990 MVT ConstVecVT = VT;
3991 unsigned NumElts = VT.getVectorNumElements();
3992 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3993 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3994 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3995 Split = true;
3996 }
3997
3998 MVT EltVT = ConstVecVT.getVectorElementType();
3999 for (unsigned i = 0; i < NumElts; ++i) {
4000 bool IsUndef = Values[i] < 0 && IsMask;
4001 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4002 DAG.getConstant(Values[i], dl, EltVT);
4003 Ops.push_back(OpNode);
4004 if (Split)
4005 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4006 DAG.getConstant(0, dl, EltVT));
4007 }
4008 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4009 if (Split)
4010 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4011 return ConstsNode;
4012}
4013
4014static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4015 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4016 assert(Bits.size() == Undefs.getBitWidth() &&
4017 "Unequal constant and undef arrays");
4019 bool Split = false;
4020
4021 MVT ConstVecVT = VT;
4022 unsigned NumElts = VT.getVectorNumElements();
4023 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4024 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4025 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4026 Split = true;
4027 }
4028
4029 MVT EltVT = ConstVecVT.getVectorElementType();
4030 MVT EltIntVT = EltVT.changeTypeToInteger();
4031 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4032 if (Undefs[i]) {
4033 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4034 continue;
4035 }
4036 const APInt &V = Bits[i];
4037 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4038 if (Split) {
4039 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4040 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4041 } else {
4042 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4043 }
4044 }
4045
4046 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4047 return DAG.getBitcast(VT, ConstsNode);
4048}
4049
4050static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4051 SelectionDAG &DAG, const SDLoc &dl) {
4052 APInt Undefs = APInt::getZero(Bits.size());
4053 return getConstVector(Bits, Undefs, VT, DAG, dl);
4054}
4055
4056/// Returns a vector of specified type with all zero elements.
4057static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4058 SelectionDAG &DAG, const SDLoc &dl) {
4059 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4060 VT.getVectorElementType() == MVT::i1) &&
4061 "Unexpected vector type");
4062
4063 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4064 // type. This ensures they get CSE'd. But if the integer type is not
4065 // available, use a floating-point +0.0 instead.
4066 SDValue Vec;
4067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4068 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4069 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4070 } else if (VT.isFloatingPoint() &&
4071 TLI.isTypeLegal(VT.getVectorElementType())) {
4072 Vec = DAG.getConstantFP(+0.0, dl, VT);
4073 } else if (VT.getVectorElementType() == MVT::i1) {
4074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4075 "Unexpected vector type");
4076 Vec = DAG.getConstant(0, dl, VT);
4077 } else {
4078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4080 }
4081 return DAG.getBitcast(VT, Vec);
4082}
4083
4084// Helper to determine if the ops are all extracted subvectors that come from a
4085// single source. If we allow commuting they don't have to be in order (Lo/Hi).
4086static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4087 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4088 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4089 LHS.getValueType() != RHS.getValueType() ||
4090 LHS.getOperand(0) != RHS.getOperand(0))
4091 return SDValue();
4092
4093 SDValue Src = LHS.getOperand(0);
4094 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4095 return SDValue();
4096
4097 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4098 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4099 RHS.getConstantOperandAPInt(1) == NumElts) ||
4100 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4101 LHS.getConstantOperandAPInt(1) == NumElts))
4102 return Src;
4103
4104 return SDValue();
4105}
4106
4107static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4108 const SDLoc &dl, unsigned vectorWidth) {
4109 EVT VT = Vec.getValueType();
4110 EVT ElVT = VT.getVectorElementType();
4111 unsigned ResultNumElts =
4112 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4113 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4114
4115 assert(ResultVT.getSizeInBits() == vectorWidth &&
4116 "Illegal subvector extraction");
4117
4118 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4119 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 // If the input is a buildvector just emit a smaller one.
4127 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4128 return DAG.getBuildVector(ResultVT, dl,
4129 Vec->ops().slice(IdxVal, ElemsPerChunk));
4130
4131 // Check if we're extracting the upper undef of a widening pattern.
4132 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4133 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4134 isNullConstant(Vec.getOperand(2)))
4135 return DAG.getUNDEF(ResultVT);
4136
4137 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4138}
4139
4140/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4141/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4142/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4143/// instructions or a simple subregister reference. Idx is an index in the
4144/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4145/// lowering EXTRACT_VECTOR_ELT operations easier.
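// For example (illustrative): extracting with IdxVal = 5 from a v8i32 source
// rounds the index down to the 128-bit chunk boundary (element 4) and returns
// the upper v4i32 half of the vector.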
4146static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4147 SelectionDAG &DAG, const SDLoc &dl) {
4148 assert((Vec.getValueType().is256BitVector() ||
4149 Vec.getValueType().is512BitVector()) &&
4150 "Unexpected vector size!");
4151 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Generate a DAG to grab 256-bits from a 512-bit vector.
4155static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4156 SelectionDAG &DAG, const SDLoc &dl) {
4157 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4158 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4159}
4160
4161static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl,
4163 unsigned vectorWidth) {
4164 assert((vectorWidth == 128 || vectorWidth == 256) &&
4165 "Unsupported vector width");
4166 // Inserting UNDEF just returns Result.
4167 if (Vec.isUndef())
4168 return Result;
4169
4170 // Insert the relevant vectorWidth bits.
4171 EVT VT = Vec.getValueType();
4172 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4173 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4174
4175 // This is the index of the first element of the vectorWidth-bit chunk
4176 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4177 IdxVal &= ~(ElemsPerChunk - 1);
4178 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4179}
4180
4181/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4182/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4183/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4184/// simple superregister reference. Idx is an index in the 128 bits
4185/// we want. It need not be aligned to a 128-bit boundary. That makes
4186/// lowering INSERT_VECTOR_ELT operations easier.
4187static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4188 SelectionDAG &DAG, const SDLoc &dl) {
4189 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4190 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4191}
4192
4193/// Widen a vector to a larger size with the same scalar type, with the new
4194/// elements either zero or undef.
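// Illustrative sketch: widening a v4i32 value to v8i32 inserts it at element 0
// of either a zero vector (ZeroNewElements = true) or an undef vector, so the
// upper four lanes become zero or undef respectively.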
4195static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4196 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4197 const SDLoc &dl) {
4198 EVT VecVT = Vec.getValueType();
4200 VecVT.getScalarType() == VT.getScalarType() &&
4201 "Unsupported vector widening type");
4202 // If the upper 128-bits of a build vector are already undef/zero, then try to
4203 // widen from the lower 128-bits.
4204 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4205 unsigned NumSrcElts = VecVT.getVectorNumElements();
4206 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4207 if (all_of(Hi, [&](SDValue V) {
4208 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4209 }))
4210 Vec = extract128BitVector(Vec, 0, DAG, dl);
4211 }
4212 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4213 : DAG.getUNDEF(VT);
4214 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4215}
4216
4217/// Widen a vector to a larger size with the same scalar type, with the new
4218/// elements either zero or undef.
4219static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4220 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4221 const SDLoc &dl, unsigned WideSizeInBits) {
4222 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4223 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4224 "Unsupported vector widening type");
4225 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4226 MVT SVT = Vec.getSimpleValueType().getScalarType();
4227 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4228 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4229}
4230
4231/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4232/// and bitcast with integer types.
4233static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4234 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4235 unsigned NumElts = VT.getVectorNumElements();
4236 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4237 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4238 return VT;
4239}
4240
4241/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4242/// bitcast with integer types.
4243static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4244 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4245 const SDLoc &dl) {
4246 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4247 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4248}
4249
4250// Helper function to collect subvector ops that are concatenated together,
4251// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
4252// The subvectors in Ops are guaranteed to be the same type.
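// For example (illustrative): concat_vectors(a,b) yields Ops = {a,b}, and
// insert_subvector(insert_subvector(undef, x, 0), y, NumElts/2) yields
// Ops = {x,y}; if no concatenation pattern is recognised the helper returns
// false and leaves Ops empty.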
4253static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4254 SelectionDAG &DAG) {
4255 assert(Ops.empty() && "Expected an empty ops vector");
4256
4257 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4258 Ops.append(N->op_begin(), N->op_end());
4259 return true;
4260 }
4261
4262 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4263 SDValue Src = N->getOperand(0);
4264 SDValue Sub = N->getOperand(1);
4265 const APInt &Idx = N->getConstantOperandAPInt(2);
4266 EVT VT = Src.getValueType();
4267 EVT SubVT = Sub.getValueType();
4268
4269 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4270 // insert_subvector(undef, x, lo)
4271 if (Idx == 0 && Src.isUndef()) {
4272 Ops.push_back(Sub);
4273 Ops.push_back(DAG.getUNDEF(SubVT));
4274 return true;
4275 }
4276 if (Idx == (VT.getVectorNumElements() / 2)) {
4277 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4278 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4279 Src.getOperand(1).getValueType() == SubVT &&
4280 isNullConstant(Src.getOperand(2))) {
4281 // Attempt to recurse into inner (matching) concats.
4282 SDValue Lo = Src.getOperand(1);
4283 SDValue Hi = Sub;
4284 SmallVector<SDValue, 2> LoOps, HiOps;
4285 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4286 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4287 LoOps.size() == HiOps.size()) {
4288 Ops.append(LoOps);
4289 Ops.append(HiOps);
4290 return true;
4291 }
4292 Ops.push_back(Lo);
4293 Ops.push_back(Hi);
4294 return true;
4295 }
4296 // insert_subvector(x, extract_subvector(x, lo), hi)
4297 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4298 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4299 Ops.append(2, Sub);
4300 return true;
4301 }
4302 // insert_subvector(undef, x, hi)
4303 if (Src.isUndef()) {
4304 Ops.push_back(DAG.getUNDEF(SubVT));
4305 Ops.push_back(Sub);
4306 return true;
4307 }
4308 }
4309 }
4310 }
4311
4312 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4313 EVT VT = N->getValueType(0);
4314 SDValue Src = N->getOperand(0);
4315 uint64_t Idx = N->getConstantOperandVal(1);
4316
4317 // Collect all the subvectors from the source vector and slice off the
4318 // extraction.
4320 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4321 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4322 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4323 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4324 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4325 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4326 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4327 return true;
4328 }
4329 }
4330
4331 assert(Ops.empty() && "Expected an empty ops vector");
4332 return false;
4333}
4334
4335// Helper to check if \p V can be split into subvectors and the upper subvectors
4336// are all undef, in which case return the (concatenated) lower subvectors.
4337static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4338 SelectionDAG &DAG) {
4339 SmallVector<SDValue> SubOps;
4340 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4341 return SDValue();
4342
4343 unsigned NumSubOps = SubOps.size();
4344 unsigned HalfNumSubOps = NumSubOps / 2;
4345 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4346
4347 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4348 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4349 return SDValue();
4350
4351 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4352 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4353 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4354}
4355
4356// Helper to check if we can access all the constituent subvectors without any
4357// extract ops.
4358static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4359 SmallVector<SDValue> Ops;
4360 return collectConcatOps(V.getNode(), Ops, DAG);
4362
4363static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4364 const SDLoc &dl) {
4365 EVT VT = Op.getValueType();
4366 unsigned NumElems = VT.getVectorNumElements();
4367 unsigned SizeInBits = VT.getSizeInBits();
4368 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4369 "Can't split odd sized vector");
4370
4372 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4373 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4374 unsigned HalfOps = SubOps.size() / 2;
4375 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4376 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4377 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4378 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4379 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4380 return std::make_pair(Lo, Hi);
4381 }
4382
4383 // If this is a splat value (with no-undefs) then use the lower subvector,
4384 // which should be a free extraction.
4385 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4386 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4387 return std::make_pair(Lo, Lo);
4388
4389 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4390 return std::make_pair(Lo, Hi);
4391}
4392
4393/// Break an operation into 2 half sized ops and then concatenate the results.
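// Illustrative sketch (not from the source): a 512-bit v16i32 ADD is rewritten
// as concat_vectors(ADD(LoA, LoB), ADD(HiA, HiB)) on the two v8i32 halves;
// non-vector operands (e.g. an immediate) are passed unchanged to both halves.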
4394static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4395 unsigned NumOps = Op.getNumOperands();
4396 EVT VT = Op.getValueType();
4397
4398 // Extract the LHS Lo/Hi vectors
4401 for (unsigned I = 0; I != NumOps; ++I) {
4402 SDValue SrcOp = Op.getOperand(I);
4403 if (!SrcOp.getValueType().isVector()) {
4404 LoOps[I] = HiOps[I] = SrcOp;
4405 continue;
4406 }
4407 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4408 }
4409
4410 EVT LoVT, HiVT;
4411 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4412 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4413 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4414 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4415}
4416
4417/// Break a unary integer operation into 2 half-sized ops and then
4418/// concatenate the results back together.
4419static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4420 const SDLoc &dl) {
4421 // Make sure we only try to split 256/512-bit types to avoid creating
4422 // narrow vectors.
4423 [[maybe_unused]] EVT VT = Op.getValueType();
4424 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4425 Op.getOperand(0).getValueType().is512BitVector()) &&
4426 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4427 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4428 VT.getVectorNumElements() &&
4429 "Unexpected VTs!");
4430 return splitVectorOp(Op, DAG, dl);
4431}
4432
4433/// Break a binary integer operation into 2 half sized ops and then
4434/// concatenate the result back.
4435static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4436 const SDLoc &dl) {
4437 // Assert that all the types match.
4438 [[maybe_unused]] EVT VT = Op.getValueType();
4439 assert(Op.getOperand(0).getValueType() == VT &&
4440 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4441 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4442 return splitVectorOp(Op, DAG, dl);
4443}
4444
4445// Helper for splitting operands of an operation to legal target size and
4446// apply a function on each part.
4447// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4448// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4449// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4450// The argument Builder is a function that will be applied on each split part:
4451// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
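// Hedged usage sketch (the lambda and value names below are illustrative, not
// from this file):
//   SplitOpsAndApply(DAG, Subtarget, DL, VT, {X, Y},
//                    [](SelectionDAG &G, const SDLoc &DL,
//                       ArrayRef<SDValue> Ops) {
//                      return G.getNode(ISD::ADD, DL, Ops[0].getValueType(),
//                                       Ops[0], Ops[1]);
//                    });
// On an AVX2-only target with a 512-bit VT this applies the builder to the two
// 256-bit halves and concatenates the results.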
4452template <typename F>
4453static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4454 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4455 F Builder, bool CheckBWI = true) {
4456 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4457 unsigned NumSubs = 1;
4458 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4459 (!CheckBWI && Subtarget.useAVX512Regs())) {
4460 if (VT.getSizeInBits() > 512) {
4461 NumSubs = VT.getSizeInBits() / 512;
4462 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4463 }
4464 } else if (Subtarget.hasAVX2()) {
4465 if (VT.getSizeInBits() > 256) {
4466 NumSubs = VT.getSizeInBits() / 256;
4467 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4468 }
4469 } else {
4470 if (VT.getSizeInBits() > 128) {
4471 NumSubs = VT.getSizeInBits() / 128;
4472 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4473 }
4474 }
4475
4476 if (NumSubs == 1)
4477 return Builder(DAG, DL, Ops);
4478
4480 for (unsigned i = 0; i != NumSubs; ++i) {
4482 for (SDValue Op : Ops) {
4483 EVT OpVT = Op.getValueType();
4484 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4485 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4486 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4487 }
4488 Subs.push_back(Builder(DAG, DL, SubOps));
4489 }
4490 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4491}
4492
4493// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4494// targets.
4495static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4496 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4497 const X86Subtarget &Subtarget) {
4498 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4499 MVT SVT = VT.getScalarType();
4500
4501 // If we have a 32/64 splatted constant, splat it to DstTy to
4502 // encourage a foldable broadcast'd operand.
4503 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4504 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4505 // AVX512 broadcasts 32/64-bit operands.
4506 // TODO: Support float once getAVX512Node is used by fp-ops.
4507 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4509 return SDValue();
4510 // If we're not widening, don't bother if we're not bitcasting.
4511 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4512 return SDValue();
4513 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4514 APInt SplatValue, SplatUndef;
4515 unsigned SplatBitSize;
4516 bool HasAnyUndefs;
4517 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4518 HasAnyUndefs, OpEltSizeInBits) &&
4519 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4520 return DAG.getConstant(SplatValue, DL, DstVT);
4521 }
4522 return SDValue();
4523 };
4524
4525 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4526
4527 MVT DstVT = VT;
4528 if (Widen)
4529 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4530
4531 // Canonicalize src operands.
4532 SmallVector<SDValue> SrcOps(Ops);
4533 for (SDValue &Op : SrcOps) {
4534 MVT OpVT = Op.getSimpleValueType();
4535 // Just pass through scalar operands.
4536 if (!OpVT.isVector())
4537 continue;
4538 assert(OpVT == VT && "Vector type mismatch");
4539
4540 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4541 Op = BroadcastOp;
4542 continue;
4543 }
4544
4545 // Just widen the subvector by inserting into an undef wide vector.
4546 if (Widen)
4547 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4548 }
4549
4550 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4551
4552 // Perform the 512-bit op then extract the bottom subvector.
4553 if (Widen)
4554 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4555 return Res;
4556}
4557
4558/// Insert an i1-subvector into an i1-vector.
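// Illustrative sketch: inserting a v2i1 subvector at index 2 of a v4i1 vector
// is performed in a wider legal mask type (v8i1/v16i1), using KSHIFTL/KSHIFTR
// to move the subvector into place and to clear the affected bits of the
// original vector before merging the pieces with OR.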
4559static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4560 const X86Subtarget &Subtarget) {
4561
4562 SDLoc dl(Op);
4563 SDValue Vec = Op.getOperand(0);
4564 SDValue SubVec = Op.getOperand(1);
4565 SDValue Idx = Op.getOperand(2);
4566 unsigned IdxVal = Op.getConstantOperandVal(2);
4567
4568 // Inserting undef is a nop. We can just return the original vector.
4569 if (SubVec.isUndef())
4570 return Vec;
4571
4572 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4573 return Op;
4574
4575 MVT OpVT = Op.getSimpleValueType();
4576 unsigned NumElems = OpVT.getVectorNumElements();
4577 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4578
4579 // Extend to natively supported kshift.
4580 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4581
4582 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4583 // if necessary.
4584 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4585 // May need to promote to a legal type.
4586 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4587 DAG.getConstant(0, dl, WideOpVT),
4588 SubVec, Idx);
4589 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4590 }
4591
4592 MVT SubVecVT = SubVec.getSimpleValueType();
4593 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4594 assert(IdxVal + SubVecNumElems <= NumElems &&
4595 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4596 "Unexpected index value in INSERT_SUBVECTOR");
4597
4598 SDValue Undef = DAG.getUNDEF(WideOpVT);
4599
4600 if (IdxVal == 0) {
4601 // Zero lower bits of the Vec
4602 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4603 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4604 ZeroIdx);
4605 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4606 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4607 // Merge them together, SubVec should be zero extended.
4608 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4609 DAG.getConstant(0, dl, WideOpVT),
4610 SubVec, ZeroIdx);
4611 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4612 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4613 }
4614
4615 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4616 Undef, SubVec, ZeroIdx);
4617
4618 if (Vec.isUndef()) {
4619 assert(IdxVal != 0 && "Unexpected index");
4620 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4621 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4623 }
4624
4625 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4626 assert(IdxVal != 0 && "Unexpected index");
4627 // If upper elements of Vec are known undef, then just shift into place.
4628 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4629 [](SDValue V) { return V.isUndef(); })) {
4630 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4631 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4632 } else {
4633 NumElems = WideOpVT.getVectorNumElements();
4634 unsigned ShiftLeft = NumElems - SubVecNumElems;
4635 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4636 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4637 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4638 if (ShiftRight != 0)
4639 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4640 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4641 }
4642 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4643 }
4644
4645 // Simple case when we put subvector in the upper part
4646 if (IdxVal + SubVecNumElems == NumElems) {
4647 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4648 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4649 if (SubVecNumElems * 2 == NumElems) {
4650 // Special case, use legal zero extending insert_subvector. This allows
4651 // isel to optimize when bits are known zero.
4652 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4653 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4654 DAG.getConstant(0, dl, WideOpVT),
4655 Vec, ZeroIdx);
4656 } else {
4657 // Otherwise use explicit shifts to zero the bits.
4658 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4659 Undef, Vec, ZeroIdx);
4660 NumElems = WideOpVT.getVectorNumElements();
4661 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4662 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4663 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4664 }
4665 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4667 }
4668
4669 // Inserting into the middle is more complicated.
4670
4671 NumElems = WideOpVT.getVectorNumElements();
4672
4673 // Widen the vector if needed.
4674 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4675
4676 unsigned ShiftLeft = NumElems - SubVecNumElems;
4677 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4678
4679 // Do an optimization for the most frequently used types.
4680 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4681 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4682 Mask0.flipAllBits();
4683 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4684 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4685 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4686 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4687 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4688 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4689 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4690 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4691
4692 // Reduce to original width if needed.
4693 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4694 }
4695
4696 // Clear the upper bits of the subvector and move it to its insert position.
4697 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4698 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4699 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4700 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4701
4702 // Isolate the bits below the insertion point.
4703 unsigned LowShift = NumElems - IdxVal;
4704 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4705 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4706 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4707 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4708
4709 // Isolate the bits after the last inserted bit.
4710 unsigned HighShift = IdxVal + SubVecNumElems;
4711 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4712 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4713 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4714 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4715
4716 // Now OR all 3 pieces together.
4717 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4718 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4719
4720 // Reduce to original width if needed.
4721 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4722}
4723
4724static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4725 const SDLoc &dl) {
4726 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4727 EVT SubVT = V1.getValueType();
4728 EVT SubSVT = SubVT.getScalarType();
4729 unsigned SubNumElts = SubVT.getVectorNumElements();
4730 unsigned SubVectorWidth = SubVT.getSizeInBits();
4731 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4732 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4733 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4734}
4735
4736/// Returns a vector of specified type with all bits set.
4737/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4738/// Then bitcast to their original type, ensuring they get CSE'd.
4739static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4740 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4741 "Expected a 128/256/512-bit vector type");
4742 unsigned NumElts = VT.getSizeInBits() / 32;
4743 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4744 return DAG.getBitcast(VT, Vec);
4745}
4746
4747static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4748 SDValue In, SelectionDAG &DAG) {
4749 EVT InVT = In.getValueType();
4750 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4751
4752 // Canonicalize Opcode to general extension version.
4753 switch (Opcode) {
4754 case ISD::ANY_EXTEND:
4756 Opcode = ISD::ANY_EXTEND;
4757 break;
4758 case ISD::SIGN_EXTEND:
4760 Opcode = ISD::SIGN_EXTEND;
4761 break;
4762 case ISD::ZERO_EXTEND:
4764 Opcode = ISD::ZERO_EXTEND;
4765 break;
4766 default:
4767 llvm_unreachable("Unknown extension opcode");
4768 }
4769
4770 // For 256-bit vectors, we only need the lower (128-bit) input half.
4771 // For 512-bit vectors, we only need the lower input half or quarter.
4772 if (InVT.getSizeInBits() > 128) {
4773 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4774 "Expected VTs to be the same size!");
4775 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4776 In = extractSubVector(In, 0, DAG, DL,
4777 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4778 InVT = In.getValueType();
4779 }
4780
4781 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4782 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4783
4784 return DAG.getNode(Opcode, DL, VT, In);
4785}
4786
4787// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4788static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4789 SDValue Mask, SelectionDAG &DAG) {
4790 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4791 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4792 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4793}
4794
4795static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4796 bool Lo, bool Unary) {
4797 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4798 "Illegal vector type to unpack");
4799 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4800 int NumElts = VT.getVectorNumElements();
4801 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4802 for (int i = 0; i < NumElts; ++i) {
4803 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4804 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4805 Pos += (Unary ? 0 : NumElts * (i % 2));
4806 Pos += (Lo ? 0 : NumEltsInLane / 2);
4807 Mask.push_back(Pos);
4808 }
4809}
4810
4811/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4812/// imposed by AVX and specific to the unary pattern. Example:
4813/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4814/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4815static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4816 bool Lo) {
4817 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4818 int NumElts = VT.getVectorNumElements();
4819 for (int i = 0; i < NumElts; ++i) {
4820 int Pos = i / 2;
4821 Pos += (Lo ? 0 : NumElts / 2);
4822 Mask.push_back(Pos);
4823 }
4824}
4825
4826// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4827static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4828 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4831 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4832 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4833 int M = Mask[I];
4834 if (M < 0)
4835 continue;
4836 SDValue V = (M < NumElts) ? V1 : V2;
4837 if (V.isUndef())
4838 continue;
4839 Ops[I] = V.getOperand(M % NumElts);
4840 }
4841 return DAG.getBuildVector(VT, dl, Ops);
4842 }
4843
4844 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4845}
4846
4847/// Returns a vector_shuffle node for an unpackl operation.
4848static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4849 SDValue V1, SDValue V2) {
4851 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4852 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4853}
4854
4855/// Returns a vector_shuffle node for an unpackh operation.
4856static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4857 SDValue V1, SDValue V2) {
4859 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4860 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4861}
4862
4863/// Returns a node that packs the LHS + RHS nodes together at half width.
4864/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4865/// TODO: Add subvector splitting if/when we have a need for it.
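// Illustrative example: packing two v8i32 operands into a v16i16 result emits
// X86ISD::PACKUS when the inputs' upper 16 bits are known zero (and PACKUS is
// usable), X86ISD::PACKSS when they are known sign-extensions, and otherwise
// masks/shifts the requested half into the low bits first; vXi64 -> vXi32
// packing is handled with a plain shuffle instead.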
4866static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4867 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4868 bool PackHiHalf = false) {
4869 MVT OpVT = LHS.getSimpleValueType();
4870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4871 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4872 assert(OpVT == RHS.getSimpleValueType() &&
4873 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4874 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4875 "Unexpected PACK operand types");
4876 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4877 "Unexpected PACK result type");
4878
4879 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4880 if (EltSizeInBits == 32) {
4881 SmallVector<int> PackMask;
4882 int Offset = PackHiHalf ? 1 : 0;
4883 int NumElts = VT.getVectorNumElements();
4884 for (int I = 0; I != NumElts; I += 4) {
4885 PackMask.push_back(I + Offset);
4886 PackMask.push_back(I + Offset + 2);
4887 PackMask.push_back(I + Offset + NumElts);
4888 PackMask.push_back(I + Offset + NumElts + 2);
4889 }
4890 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4891 DAG.getBitcast(VT, RHS), PackMask);
4892 }
4893
4894 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4895 if (!PackHiHalf) {
4896 if (UsePackUS &&
4897 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4898 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4899 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4900
4901 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4902 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4903 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4904 }
4905
4906 // Fallback to sign/zero extending the requested half and pack.
4907 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4908 if (UsePackUS) {
4909 if (PackHiHalf) {
4910 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4911 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4912 } else {
4913 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4914 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4915 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4916 };
4917 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4918 };
4919
4920 if (!PackHiHalf) {
4921 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4922 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4923 }
4924 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4925 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4926 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4927}
4928
4929/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4930/// This produces a shuffle where the low element of V2 is swizzled into the
4931/// zero/undef vector, landing at element Idx.
4932/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4933static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4934 bool IsZero,
4935 const X86Subtarget &Subtarget,
4936 SelectionDAG &DAG) {
4937 MVT VT = V2.getSimpleValueType();
4938 SDValue V1 = IsZero
4939 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4940 int NumElems = VT.getVectorNumElements();
4941 SmallVector<int, 16> MaskVec(NumElems);
4942 for (int i = 0; i != NumElems; ++i)
4943 // If this is the insertion idx, put the low elt of V2 here.
4944 MaskVec[i] = (i == Idx) ? NumElems : i;
4945 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4946}
4947
4948static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4949 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4950 Ptr.getOpcode() == X86ISD::WrapperRIP)
4951 Ptr = Ptr.getOperand(0);
4952 return dyn_cast<ConstantPoolSDNode>(Ptr);
4953}
4954
4955// TODO: Add support for non-zero offsets.
4956static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4957 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4958 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4959 return nullptr;
4960 return CNode->getConstVal();
4961}
4962
4963static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4964 if (!Load || !ISD::isNormalLoad(Load))
4965 return nullptr;
4966 return getTargetConstantFromBasePtr(Load->getBasePtr());
4967}
4968
4969static const Constant *getTargetConstantFromNode(SDValue Op) {
4970 Op = peekThroughBitcasts(Op);
4971 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4972}
4973
4974const Constant *
4975X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4976 assert(LD && "Unexpected null LoadSDNode");
4977 return getTargetConstantFromNode(LD);
4978}
4979
4981 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4982 SDValue Cond = N->getOperand(0);
4983 SDValue RHS = N->getOperand(2);
4984 EVT CondVT = Cond.getValueType();
4985 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4986 CondVT.getVectorElementType() == MVT::i1 &&
4987 ISD::isBuildVectorAllZeros(RHS.getNode());
4988}
4989
4990// Extract raw constant bits from constant pools.
4991static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4992 APInt &UndefElts,
4993 SmallVectorImpl<APInt> &EltBits,
4994 bool AllowWholeUndefs = true,
4995 bool AllowPartialUndefs = false) {
4996 assert(EltBits.empty() && "Expected an empty EltBits vector");
4997
4999 Op = peekThroughBitcasts(Op);
5000 EVT VT = Op.getValueType();
5001 unsigned SizeInBits = VT.getSizeInBits();
5002 unsigned NumElts = SizeInBits / EltSizeInBits;
5003
5004 // Can't split constant.
5005 if ((SizeInBits % EltSizeInBits) != 0)
5006 return false;
5007
5008 // Bitcast a source array of element bits to the target size.
5009 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5010 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5011 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5012 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5013 "Constant bit sizes don't match");
5014
5015 // Don't split if we don't allow undef bits.
5016 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5017 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5018 return false;
5019
5020 // If we're already the right size, don't bother bitcasting.
5021 if (NumSrcElts == NumElts) {
5022 UndefElts = UndefSrcElts;
5023 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5024 return true;
5025 }
5026
5027 // Extract all the undef/constant element data and pack into single bitsets.
5028 APInt UndefBits(SizeInBits, 0);
5029 APInt MaskBits(SizeInBits, 0);
5030
5031 for (unsigned i = 0; i != NumSrcElts; ++i) {
5032 unsigned BitOffset = i * SrcEltSizeInBits;
5033 if (UndefSrcElts[i])
5034 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5035 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5036 }
5037
5038 // Split the undef/constant single bitset data into the target elements.
5039 UndefElts = APInt(NumElts, 0);
5040 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5041
5042 for (unsigned i = 0; i != NumElts; ++i) {
5043 unsigned BitOffset = i * EltSizeInBits;
5044 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5045
5046 // Only treat an element as UNDEF if all bits are UNDEF.
5047 if (UndefEltBits.isAllOnes()) {
5048 if (!AllowWholeUndefs)
5049 return false;
5050 UndefElts.setBit(i);
5051 continue;
5052 }
5053
5054 // If only some bits are UNDEF then treat them as zero (or bail if not
5055 // supported).
5056 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5057 return false;
5058
5059 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5060 }
5061 return true;
5062 };
5063
5064 // Collect constant bits and insert into mask/undef bit masks.
5065 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5066 unsigned UndefBitIndex) {
5067 if (!Cst)
5068 return false;
5069 if (isa<UndefValue>(Cst)) {
5070 Undefs.setBit(UndefBitIndex);
5071 return true;
5072 }
5073 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5074 Mask = CInt->getValue();
5075 return true;
5076 }
5077 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5078 Mask = CFP->getValueAPF().bitcastToAPInt();
5079 return true;
5080 }
5081 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5082 Type *Ty = CDS->getType();
5083 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5084 Type *EltTy = CDS->getElementType();
5085 bool IsInteger = EltTy->isIntegerTy();
5086 bool IsFP =
5087 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5088 if (!IsInteger && !IsFP)
5089 return false;
5090 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5091 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5092 if (IsInteger)
5093 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5094 else
5095 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5096 I * EltBits);
5097 return true;
5098 }
5099 return false;
5100 };
5101
5102 // Handle UNDEFs.
5103 if (Op.isUndef()) {
5104 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5105 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5106 return CastBitData(UndefSrcElts, SrcEltBits);
5107 }
5108
5109 // Extract scalar constant bits.
5110 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5111 APInt UndefSrcElts = APInt::getZero(1);
5112 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5113 return CastBitData(UndefSrcElts, SrcEltBits);
5114 }
5115 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5116 APInt UndefSrcElts = APInt::getZero(1);
5117 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5118 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5119 return CastBitData(UndefSrcElts, SrcEltBits);
5120 }
5121
5122 // Extract constant bits from build vector.
5123 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5124 BitVector Undefs;
5125 SmallVector<APInt> SrcEltBits;
5126 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5127 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5128 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5129 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5130 if (Undefs[I])
5131 UndefSrcElts.setBit(I);
5132 return CastBitData(UndefSrcElts, SrcEltBits);
5133 }
5134 }
5135
5136 // Extract constant bits from constant pool vector.
5137 if (auto *Cst = getTargetConstantFromNode(Op)) {
5138 Type *CstTy = Cst->getType();
5139 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5140 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5141 return false;
5142
5143 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5144 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5145 if ((SizeInBits % SrcEltSizeInBits) != 0)
5146 return false;
5147
5148 APInt UndefSrcElts(NumSrcElts, 0);
5149 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5150 for (unsigned i = 0; i != NumSrcElts; ++i)
5151 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5152 UndefSrcElts, i))
5153 return false;
5154
5155 return CastBitData(UndefSrcElts, SrcEltBits);
5156 }
5157
5158 // Extract constant bits from a broadcasted constant pool scalar.
5159 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5160 EltSizeInBits <= VT.getScalarSizeInBits()) {
5161 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5162 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5163 return false;
5164
5165 SDValue Ptr = MemIntr->getBasePtr();
5166 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5167 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5168 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5169
5170 APInt UndefSrcElts(NumSrcElts, 0);
5171 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5172 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5173 if (UndefSrcElts[0])
5174 UndefSrcElts.setBits(0, NumSrcElts);
5175 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5176 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5177 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5178 return CastBitData(UndefSrcElts, SrcEltBits);
5179 }
5180 }
5181 }
5182
5183 // Extract constant bits from a subvector broadcast.
5184 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5185 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5186 SDValue Ptr = MemIntr->getBasePtr();
5187 // The source constant may be larger than the subvector broadcast,
5188 // ensure we extract the correct subvector constants.
5189 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5190 Type *CstTy = Cst->getType();
5191 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5192 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5193 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5194 (SizeInBits % SubVecSizeInBits) != 0)
5195 return false;
5196 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5197 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5198 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5199 APInt UndefSubElts(NumSubElts, 0);
5200 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5201 APInt(CstEltSizeInBits, 0));
5202 for (unsigned i = 0; i != NumSubElts; ++i) {
5203 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5204 UndefSubElts, i))
5205 return false;
5206 for (unsigned j = 1; j != NumSubVecs; ++j)
5207 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5208 }
5209 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5210 UndefSubElts);
5211 return CastBitData(UndefSubElts, SubEltBits);
5212 }
5213 }
5214
5215 // Extract a rematerialized scalar constant insertion.
5216 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5217 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5218 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5219 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5220 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5221
5222 APInt UndefSrcElts(NumSrcElts, 0);
5223 SmallVector<APInt, 64> SrcEltBits;
5224 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5225 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5226 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5227 return CastBitData(UndefSrcElts, SrcEltBits);
5228 }
5229
5230 // Insert constant bits from a base and sub vector sources.
5231 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5232 // If we bitcast to larger elements we might lose track of undefs - don't
5233 // allow any to be safe.
5234 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5235 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5236
5237 APInt UndefSrcElts, UndefSubElts;
5238 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5239 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5240 UndefSubElts, EltSubBits,
5241 AllowWholeUndefs && AllowUndefs,
5242 AllowPartialUndefs && AllowUndefs) &&
5243 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5244 UndefSrcElts, EltSrcBits,
5245 AllowWholeUndefs && AllowUndefs,
5246 AllowPartialUndefs && AllowUndefs)) {
5247 unsigned BaseIdx = Op.getConstantOperandVal(2);
5248 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5249 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5250 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5251 return CastBitData(UndefSrcElts, EltSrcBits);
5252 }
5253 }
5254
5255 // Extract constant bits from a subvector's source.
5256 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5257 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5258 EltBits, AllowWholeUndefs,
5259 AllowPartialUndefs)) {
5260 EVT SrcVT = Op.getOperand(0).getValueType();
5261 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5262 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5263 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5264 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5265 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5266 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5267 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5268
5269 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5270 if ((BaseIdx + NumSubElts) != NumSrcElts)
5271 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5272 if (BaseIdx != 0)
5273 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5274 return true;
5275 }
5276
5277 // Extract constant bits from shuffle node sources.
5278 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5279 // TODO - support shuffle through bitcasts.
5280 if (EltSizeInBits != VT.getScalarSizeInBits())
5281 return false;
5282
5283 ArrayRef<int> Mask = SVN->getMask();
5284 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5285 llvm::any_of(Mask, [](int M) { return M < 0; }))
5286 return false;
5287
5288 APInt UndefElts0, UndefElts1;
5289 SmallVector<APInt, 32> EltBits0, EltBits1;
5290 if (isAnyInRange(Mask, 0, NumElts) &&
5291 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5292 UndefElts0, EltBits0, AllowWholeUndefs,
5293 AllowPartialUndefs))
5294 return false;
5295 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5296 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5297 UndefElts1, EltBits1, AllowWholeUndefs,
5298 AllowPartialUndefs))
5299 return false;
5300
5301 UndefElts = APInt::getZero(NumElts);
5302 for (int i = 0; i != (int)NumElts; ++i) {
5303 int M = Mask[i];
5304 if (M < 0) {
5305 UndefElts.setBit(i);
5306 EltBits.push_back(APInt::getZero(EltSizeInBits));
5307 } else if (M < (int)NumElts) {
5308 if (UndefElts0[M])
5309 UndefElts.setBit(i);
5310 EltBits.push_back(EltBits0[M]);
5311 } else {
5312 if (UndefElts1[M - NumElts])
5313 UndefElts.setBit(i);
5314 EltBits.push_back(EltBits1[M - NumElts]);
5315 }
5316 }
5317 return true;
5318 }
5319
5320 return false;
5321}
5322
5323namespace llvm {
5324namespace X86 {
5325bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5326 APInt UndefElts;
5327 SmallVector<APInt, 16> EltBits;
5328 if (getTargetConstantBitsFromNode(
5329 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5330 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5331 int SplatIndex = -1;
5332 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5333 if (UndefElts[i])
5334 continue;
5335 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5336 SplatIndex = -1;
5337 break;
5338 }
5339 SplatIndex = i;
5340 }
5341 if (0 <= SplatIndex) {
5342 SplatVal = EltBits[SplatIndex];
5343 return true;
5344 }
5345 }
5346
5347 return false;
5348}
5349} // namespace X86
5350} // namespace llvm
5351
5352static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5353 unsigned MaskEltSizeInBits,
5354 SmallVectorImpl<uint64_t> &RawMask,
5355 APInt &UndefElts) {
5356 // Extract the raw target constant bits.
5357 SmallVector<APInt, 64> EltBits;
5358 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5359 EltBits, /* AllowWholeUndefs */ true,
5360 /* AllowPartialUndefs */ false))
5361 return false;
5362
5363 // Insert the extracted elements into the mask.
5364 for (const APInt &Elt : EltBits)
5365 RawMask.push_back(Elt.getZExtValue());
5366
5367 return true;
5368}
5369
5370static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5371 bool AllowUndefs) {
5372 APInt UndefElts;
5373 SmallVector<APInt, 64> EltBits;
5374 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5375 /*AllowWholeUndefs*/ AllowUndefs,
5376 /*AllowPartialUndefs*/ false))
5377 return false;
5378
5379 bool IsPow2OrUndef = true;
5380 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5381 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5382 return IsPow2OrUndef;
5383}
5384
5385// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5386static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5387 // TODO: don't always ignore oneuse constraints.
5388 V = peekThroughBitcasts(V);
5389 EVT VT = V.getValueType();
5390
5391 // Match not(xor X, -1) -> X.
5392 if (V.getOpcode() == ISD::XOR &&
5393 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5394 isAllOnesConstant(V.getOperand(1))))
5395 return V.getOperand(0);
5396
5397 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5398 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5399 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5400 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5401 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5403 V.getOperand(1));
5404 }
5405 }
5406
5407 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5408 if (V.getOpcode() == X86ISD::PCMPGT &&
5409 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5410 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5411 V.getOperand(0).hasOneUse()) {
5412 APInt UndefElts;
5413 SmallVector<APInt> EltBits;
5414 if (getTargetConstantBitsFromNode(V.getOperand(0),
5415 V.getScalarValueSizeInBits(), UndefElts,
5416 EltBits) &&
5417 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5418 // Don't fold min_signed_value -> (min_signed_value - 1)
5419 bool MinSigned = false;
5420 for (APInt &Elt : EltBits) {
5421 MinSigned |= Elt.isMinSignedValue();
5422 Elt -= 1;
5423 }
5424 if (!MinSigned) {
5425 SDLoc DL(V);
5426 MVT VT = V.getSimpleValueType();
5427 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5428 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5429 }
5430 }
5431 }
5432
5433  // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5434  SmallVector<SDValue, 2> CatOps;
5435  if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5436 for (SDValue &CatOp : CatOps) {
5437 SDValue NotCat = IsNOT(CatOp, DAG);
5438 if (!NotCat)
5439 return SDValue();
5440 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5441 }
5442 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5443 }
5444
5445 // Match not(or(not(X),not(Y))) -> and(X, Y).
5446 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5447 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5448 // TODO: Handle cases with single NOT operand -> ANDNP
5449 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5450 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5451 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5452 DAG.getBitcast(VT, Op1));
5453 }
5454
5455 return SDValue();
5456}
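// Worked example (illustrative): the PCMPGT fold above relies on the signed
// identity not(C > X) == (X >= C) == (X > C - 1), which is why the constant
// vector is decremented and the operands swapped, e.g.
// not(pcmpgt(<i32 4, ...>, X)) -> pcmpgt(X, <i32 3, ...>). The fold is skipped
// if any element equals the minimum signed value, where C - 1 would wrap.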
5457
5458/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5459/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5460/// Note: This ignores saturation, so inputs must be checked first.
5461static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5462                                  bool Unary, unsigned NumStages = 1) {
5463 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5464 unsigned NumElts = VT.getVectorNumElements();
5465 unsigned NumLanes = VT.getSizeInBits() / 128;
5466 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5467 unsigned Offset = Unary ? 0 : NumElts;
5468 unsigned Repetitions = 1u << (NumStages - 1);
5469 unsigned Increment = 1u << NumStages;
5470 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5471
5472 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5473 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5474 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5475 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5476 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5477 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5478 }
5479 }
5480}
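// Worked example (illustrative): a single-stage, binary 128-bit pack such as
// PACKUSWB (VT = v16i8, Unary = false) selects the even byte positions of the
// two concatenated inputs:
//   SmallVector<int, 16> PackMask;
//   createPackShuffleMask(MVT::v16i8, PackMask, /*Unary=*/false);
//   // PackMask == {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30}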
5481
5482// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5483static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5484 APInt &DemandedLHS, APInt &DemandedRHS) {
5485 int NumLanes = VT.getSizeInBits() / 128;
5486 int NumElts = DemandedElts.getBitWidth();
5487 int NumInnerElts = NumElts / 2;
5488 int NumEltsPerLane = NumElts / NumLanes;
5489 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5490
5491 DemandedLHS = APInt::getZero(NumInnerElts);
5492 DemandedRHS = APInt::getZero(NumInnerElts);
5493
5494 // Map DemandedElts to the packed operands.
5495 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5496 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5497 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5498 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5499 if (DemandedElts[OuterIdx])
5500 DemandedLHS.setBit(InnerIdx);
5501 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5502 DemandedRHS.setBit(InnerIdx);
5503 }
5504 }
5505}
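// Worked example (illustrative): for a 128-bit PACK producing v16i8, result
// elements 0-7 come from the LHS v8i16 operand and elements 8-15 from the RHS,
// so demanding result elements {1, 9} yields DemandedLHS = {1} and
// DemandedRHS = {1}.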
5506
5507// Split the demanded elts of a HADD/HSUB node between its operands.
5508static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5509 APInt &DemandedLHS, APInt &DemandedRHS) {
5510  getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5511                                      DemandedLHS, DemandedRHS);
5512 DemandedLHS |= DemandedLHS << 1;
5513 DemandedRHS |= DemandedRHS << 1;
5514}
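// Worked example (illustrative): a v4i32 HADD computes
// {L0+L1, L2+L3, R0+R1, R2+R3}, so demanding result element 2 demands RHS
// elements 0 and 1: the first element of each pair is marked and the
// shift-by-one OR above adds its partner.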
5515
5516/// Calculates the shuffle mask corresponding to the target-specific opcode.
5517/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5518/// operands in \p Ops, and returns true.
5519/// Sets \p IsUnary to true if only one source is used. Note that this will set
5520/// IsUnary for shuffles which use a single input multiple times, and in those
5521/// cases it will adjust the mask to only have indices within that single input.
5522/// It is an error to call this with non-empty Mask/Ops vectors.
5523static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5524                                 SmallVectorImpl<SDValue> &Ops,
5525                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5526 if (!isTargetShuffle(N.getOpcode()))
5527 return false;
5528
5529 MVT VT = N.getSimpleValueType();
5530 unsigned NumElems = VT.getVectorNumElements();
5531 unsigned MaskEltSize = VT.getScalarSizeInBits();
5532  SmallVector<uint64_t, 32> RawMask;
5533  APInt RawUndefs;
5534 uint64_t ImmN;
5535
5536 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5537 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5538
5539 IsUnary = false;
5540 bool IsFakeUnary = false;
5541 switch (N.getOpcode()) {
5542 case X86ISD::BLENDI:
5543 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5544 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5545 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5546 DecodeBLENDMask(NumElems, ImmN, Mask);
5547 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5548 break;
5549 case X86ISD::SHUFP:
5550 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5551 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5552 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5553 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5554 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5555 break;
5556 case X86ISD::INSERTPS:
5557 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5558 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5559 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5560 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5561 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5562 break;
5563 case X86ISD::EXTRQI:
5564 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5565 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5566 isa<ConstantSDNode>(N.getOperand(2))) {
5567 int BitLen = N.getConstantOperandVal(1);
5568 int BitIdx = N.getConstantOperandVal(2);
5569 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5570 IsUnary = true;
5571 }
5572 break;
5573 case X86ISD::INSERTQI:
5574 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5575 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5576 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5577 isa<ConstantSDNode>(N.getOperand(3))) {
5578 int BitLen = N.getConstantOperandVal(2);
5579 int BitIdx = N.getConstantOperandVal(3);
5580 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5581 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5582 }
5583 break;
5584 case X86ISD::UNPCKH:
5585 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5586 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5587 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5588 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5589 break;
5590 case X86ISD::UNPCKL:
5591 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5592 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5593 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5594 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5595 break;
5596 case X86ISD::MOVHLPS:
5597 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5598 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5599 DecodeMOVHLPSMask(NumElems, Mask);
5600 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5601 break;
5602 case X86ISD::MOVLHPS:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5605 DecodeMOVLHPSMask(NumElems, Mask);
5606 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5607 break;
5608 case X86ISD::VALIGN:
5609 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5610 "Only 32-bit and 64-bit elements are supported!");
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5614 DecodeVALIGNMask(NumElems, ImmN, Mask);
5615 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5616 Ops.push_back(N.getOperand(1));
5617 Ops.push_back(N.getOperand(0));
5618 break;
5619 case X86ISD::PALIGNR:
5620 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5621 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5622 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5623 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5624 DecodePALIGNRMask(NumElems, ImmN, Mask);
5625 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5626 Ops.push_back(N.getOperand(1));
5627 Ops.push_back(N.getOperand(0));
5628 break;
5629 case X86ISD::VSHLDQ:
5630 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5633 DecodePSLLDQMask(NumElems, ImmN, Mask);
5634 IsUnary = true;
5635 break;
5636 case X86ISD::VSRLDQ:
5637 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5638 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5639 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5640 DecodePSRLDQMask(NumElems, ImmN, Mask);
5641 IsUnary = true;
5642 break;
5643 case X86ISD::PSHUFD:
5644 case X86ISD::VPERMILPI:
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5647 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5648 IsUnary = true;
5649 break;
5650 case X86ISD::PSHUFHW:
5651 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5654 IsUnary = true;
5655 break;
5656 case X86ISD::PSHUFLW:
5657 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5658 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5659 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5660 IsUnary = true;
5661 break;
5662 case X86ISD::VZEXT_MOVL:
5663 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5664 DecodeZeroMoveLowMask(NumElems, Mask);
5665 IsUnary = true;
5666 break;
5667 case X86ISD::VBROADCAST:
5668  // We only decode broadcasts of same-sized vectors; peeking through to
5669 // extracted subvectors is likely to cause hasOneUse issues with
5670 // SimplifyDemandedBits etc.
5671 if (N.getOperand(0).getValueType() == VT) {
5672 DecodeVectorBroadcast(NumElems, Mask);
5673 IsUnary = true;
5674 break;
5675 }
5676 return false;
5677 case X86ISD::VPERMILPV: {
5678 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5679 IsUnary = true;
5680 SDValue MaskNode = N.getOperand(1);
5681 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5682 RawUndefs)) {
5683 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5684 break;
5685 }
5686 return false;
5687 }
5688 case X86ISD::PSHUFB: {
5689 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5690 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5691 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5692 IsUnary = true;
5693 SDValue MaskNode = N.getOperand(1);
5694 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5695 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 case X86ISD::VPERMI:
5701 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5702 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5703 DecodeVPERMMask(NumElems, ImmN, Mask);
5704 IsUnary = true;
5705 break;
5706 case X86ISD::MOVSS:
5707 case X86ISD::MOVSD:
5708 case X86ISD::MOVSH:
5709 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5710 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5711 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5712 break;
5713 case X86ISD::VPERM2X128:
5714 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5715 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5716 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5717 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5718 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5719 break;
5720 case X86ISD::SHUF128:
5721 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5723 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5724 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5725 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5726 break;
5727 case X86ISD::MOVSLDUP:
5728 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5729 DecodeMOVSLDUPMask(NumElems, Mask);
5730 IsUnary = true;
5731 break;
5732 case X86ISD::MOVSHDUP:
5733 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5734 DecodeMOVSHDUPMask(NumElems, Mask);
5735 IsUnary = true;
5736 break;
5737 case X86ISD::MOVDDUP:
5738 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5739 DecodeMOVDDUPMask(NumElems, Mask);
5740 IsUnary = true;
5741 break;
5742 case X86ISD::VPERMIL2: {
5743 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5744 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5745 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5746 SDValue MaskNode = N.getOperand(2);
5747 SDValue CtrlNode = N.getOperand(3);
5748 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5749 unsigned CtrlImm = CtrlOp->getZExtValue();
5750 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5751 RawUndefs)) {
5752 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5753 Mask);
5754 break;
5755 }
5756 }
5757 return false;
5758 }
5759 case X86ISD::VPPERM: {
5760 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5761 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5762 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5763 SDValue MaskNode = N.getOperand(2);
5764 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5765 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5766 break;
5767 }
5768 return false;
5769 }
5770 case X86ISD::VPERMV: {
5771 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5772 IsUnary = true;
5773 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5774 Ops.push_back(N.getOperand(1));
5775 SDValue MaskNode = N.getOperand(0);
5776 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5777 RawUndefs)) {
5778 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5779 break;
5780 }
5781 return false;
5782 }
5783 case X86ISD::VPERMV3: {
5784 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5785 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5786 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5787 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5788 Ops.push_back(N.getOperand(0));
5789 Ops.push_back(N.getOperand(2));
5790 SDValue MaskNode = N.getOperand(1);
5791 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5792 RawUndefs)) {
5793 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5794 break;
5795 }
5796 return false;
5797 }
5798 default:
5799 llvm_unreachable("unknown target shuffle node");
5800 }
5801
5802 // Empty mask indicates the decode failed.
5803 if (Mask.empty())
5804 return false;
5805
5806 // Check if we're getting a shuffle mask with zero'd elements.
5807 if (!AllowSentinelZero && isAnyZero(Mask))
5808 return false;
5809
5810 // If we have a fake unary shuffle, the shuffle mask is spread across two
5811 // inputs that are actually the same node. Re-map the mask to always point
5812 // into the first input.
5813 if (IsFakeUnary)
5814 for (int &M : Mask)
5815 if (M >= (int)Mask.size())
5816 M -= Mask.size();
5817
5818 // If we didn't already add operands in the opcode-specific code, default to
5819 // adding 1 or 2 operands starting at 0.
5820 if (Ops.empty()) {
5821 Ops.push_back(N.getOperand(0));
5822 if (!IsUnary || IsFakeUnary)
5823 Ops.push_back(N.getOperand(1));
5824 }
5825
5826 return true;
5827}
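// Worked example (illustrative): for an X86ISD::UNPCKL with VT = v4i32 this
// returns Ops = {Op0, Op1} and Mask = {0, 4, 1, 5} (low elements interleaved).
// If Op0 == Op1 the node is reported as (fake) unary and the mask is remapped
// to {0, 0, 1, 1} so that every index points into the first input.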
5828
5829// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5830static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5831                                 SmallVectorImpl<SDValue> &Ops,
5832                                 SmallVectorImpl<int> &Mask) {
5833 bool IsUnary;
5834 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5835}
5836
5837/// Compute whether each element of a shuffle is zeroable.
5838///
5839/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5840/// Either it is an undef element in the shuffle mask, the element of the input
5841/// referenced is undef, or the element of the input referenced is known to be
5842/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5843/// as many lanes with this technique as possible to simplify the remaining
5844/// shuffle.
5845static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5846                                           SDValue V1, SDValue V2,
5847 APInt &KnownUndef, APInt &KnownZero) {
5848 int Size = Mask.size();
5849 KnownUndef = KnownZero = APInt::getZero(Size);
5850
5851 V1 = peekThroughBitcasts(V1);
5852 V2 = peekThroughBitcasts(V2);
5853
5854 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5855 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5856
5857 int VectorSizeInBits = V1.getValueSizeInBits();
5858 int ScalarSizeInBits = VectorSizeInBits / Size;
5859 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5860
5861 for (int i = 0; i < Size; ++i) {
5862 int M = Mask[i];
5863 // Handle the easy cases.
5864 if (M < 0) {
5865 KnownUndef.setBit(i);
5866 continue;
5867 }
5868 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5869 KnownZero.setBit(i);
5870 continue;
5871 }
5872
5873 // Determine shuffle input and normalize the mask.
5874 SDValue V = M < Size ? V1 : V2;
5875 M %= Size;
5876
5877 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5878 if (V.getOpcode() != ISD::BUILD_VECTOR)
5879 continue;
5880
5881    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5882 // the (larger) source element must be UNDEF/ZERO.
5883 if ((Size % V.getNumOperands()) == 0) {
5884 int Scale = Size / V->getNumOperands();
5885 SDValue Op = V.getOperand(M / Scale);
5886 if (Op.isUndef())
5887 KnownUndef.setBit(i);
5888 if (X86::isZeroNode(Op))
5889 KnownZero.setBit(i);
5890 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5891 APInt Val = Cst->getAPIntValue();
5892 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5893 if (Val == 0)
5894 KnownZero.setBit(i);
5895 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5896 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5897 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5898 if (Val == 0)
5899 KnownZero.setBit(i);
5900 }
5901 continue;
5902 }
5903
5904    // If the BUILD_VECTOR has more elements, then all the (smaller) source
5905 // elements must be UNDEF or ZERO.
5906 if ((V.getNumOperands() % Size) == 0) {
5907 int Scale = V->getNumOperands() / Size;
5908 bool AllUndef = true;
5909 bool AllZero = true;
5910 for (int j = 0; j < Scale; ++j) {
5911 SDValue Op = V.getOperand((M * Scale) + j);
5912 AllUndef &= Op.isUndef();
5913 AllZero &= X86::isZeroNode(Op);
5914 }
5915 if (AllUndef)
5916 KnownUndef.setBit(i);
5917 if (AllZero)
5918 KnownZero.setBit(i);
5919 continue;
5920 }
5921 }
5922}
5923
5924/// Decode a target shuffle mask and inputs and see if any values are
5925/// known to be undef or zero from their inputs.
5926/// Returns true if the target shuffle mask was decoded.
5927/// FIXME: Merge this with computeZeroableShuffleElements?
5928static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5929                                         SmallVectorImpl<SDValue> &Ops,
5930                                         APInt &KnownUndef, APInt &KnownZero) {
5931 bool IsUnary;
5932 if (!isTargetShuffle(N.getOpcode()))
5933 return false;
5934
5935 MVT VT = N.getSimpleValueType();
5936 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5937 return false;
5938
5939 int Size = Mask.size();
5940 SDValue V1 = Ops[0];
5941 SDValue V2 = IsUnary ? V1 : Ops[1];
5942 KnownUndef = KnownZero = APInt::getZero(Size);
5943
5944 V1 = peekThroughBitcasts(V1);
5945 V2 = peekThroughBitcasts(V2);
5946
5947 assert((VT.getSizeInBits() % Size) == 0 &&
5948 "Illegal split of shuffle value type");
5949 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5950
5951 // Extract known constant input data.
5952 APInt UndefSrcElts[2];
5953 SmallVector<APInt, 32> SrcEltBits[2];
5954 bool IsSrcConstant[2] = {
5955 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5956 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5957 /*AllowPartialUndefs*/ false),
5958 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5959 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5960 /*AllowPartialUndefs*/ false)};
5961
5962 for (int i = 0; i < Size; ++i) {
5963 int M = Mask[i];
5964
5965 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5966 if (M < 0) {
5967 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5968 if (SM_SentinelUndef == M)
5969 KnownUndef.setBit(i);
5970 if (SM_SentinelZero == M)
5971 KnownZero.setBit(i);
5972 continue;
5973 }
5974
5975 // Determine shuffle input and normalize the mask.
5976 unsigned SrcIdx = M / Size;
5977 SDValue V = M < Size ? V1 : V2;
5978 M %= Size;
5979
5980 // We are referencing an UNDEF input.
5981 if (V.isUndef()) {
5982 KnownUndef.setBit(i);
5983 continue;
5984 }
5985
5986 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5987 // TODO: We currently only set UNDEF for integer types - floats use the same
5988 // registers as vectors and many of the scalar folded loads rely on the
5989 // SCALAR_TO_VECTOR pattern.
5990 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5991 (Size % V.getValueType().getVectorNumElements()) == 0) {
5992 int Scale = Size / V.getValueType().getVectorNumElements();
5993 int Idx = M / Scale;
5994 if (Idx != 0 && !VT.isFloatingPoint())
5995 KnownUndef.setBit(i);
5996 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5997 KnownZero.setBit(i);
5998 continue;
5999 }
6000
6001 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6002 // base vectors.
6003 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6004 SDValue Vec = V.getOperand(0);
6005 int NumVecElts = Vec.getValueType().getVectorNumElements();
6006 if (Vec.isUndef() && Size == NumVecElts) {
6007 int Idx = V.getConstantOperandVal(2);
6008 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6009 if (M < Idx || (Idx + NumSubElts) <= M)
6010 KnownUndef.setBit(i);
6011 }
6012 continue;
6013 }
6014
6015 // Attempt to extract from the source's constant bits.
6016 if (IsSrcConstant[SrcIdx]) {
6017 if (UndefSrcElts[SrcIdx][M])
6018 KnownUndef.setBit(i);
6019 else if (SrcEltBits[SrcIdx][M] == 0)
6020 KnownZero.setBit(i);
6021 }
6022 }
6023
6024 assert(VT.getVectorNumElements() == (unsigned)Size &&
6025 "Different mask size from vector size!");
6026 return true;
6027}
6028
6029// Replace target shuffle mask elements with known undef/zero sentinels.
6030static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6031                                              const APInt &KnownUndef,
6032 const APInt &KnownZero,
6033                                              bool ResolveKnownZeros = true) {
6034 unsigned NumElts = Mask.size();
6035 assert(KnownUndef.getBitWidth() == NumElts &&
6036 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6037
6038 for (unsigned i = 0; i != NumElts; ++i) {
6039 if (KnownUndef[i])
6040 Mask[i] = SM_SentinelUndef;
6041 else if (ResolveKnownZeros && KnownZero[i])
6042 Mask[i] = SM_SentinelZero;
6043 }
6044}
6045
6046// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6047static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6048                                              APInt &KnownUndef,
6049 APInt &KnownZero) {
6050 unsigned NumElts = Mask.size();
6051 KnownUndef = KnownZero = APInt::getZero(NumElts);
6052
6053 for (unsigned i = 0; i != NumElts; ++i) {
6054 int M = Mask[i];
6055 if (SM_SentinelUndef == M)
6056 KnownUndef.setBit(i);
6057 if (SM_SentinelZero == M)
6058 KnownZero.setBit(i);
6059 }
6060}
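// Worked example (illustrative): the two helpers above are inverses over the
// sentinel encoding, e.g. Mask = {0, SM_SentinelZero, 2, SM_SentinelUndef}
// yields KnownUndef = 0b1000 and KnownZero = 0b0010, and resolving those
// bitmasks back into the mask reproduces the same sentinels.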
6061
6062// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
6063static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6064                                         SDValue Cond, bool IsBLENDV = false) {
6065 EVT CondVT = Cond.getValueType();
6066 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6067 unsigned NumElts = CondVT.getVectorNumElements();
6068
6069 APInt UndefElts;
6070 SmallVector<APInt, 32> EltBits;
6071 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6072 /*AllowWholeUndefs*/ true,
6073 /*AllowPartialUndefs*/ false))
6074 return false;
6075
6076 Mask.resize(NumElts, SM_SentinelUndef);
6077
6078 for (int i = 0; i != (int)NumElts; ++i) {
6079 Mask[i] = i;
6080 // Arbitrarily choose from the 2nd operand if the select condition element
6081 // is undef.
6082 // TODO: Can we do better by matching patterns such as even/odd?
6083 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6084 (IsBLENDV && EltBits[i].isNonNegative()))
6085 Mask[i] += NumElts;
6086 }
6087
6088 return true;
6089}
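// Worked example (illustrative): a VSELECT with constant condition
// <4 x i32> <i32 -1, i32 0, i32 -1, i32 0> produces Mask = {0, 5, 2, 7}: a
// true (non-zero) condition element keeps the lane from the first value
// operand, while a false element redirects it to the second (index + NumElts).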
6090
6091// Forward declaration (for getFauxShuffleMask recursive check).
6092static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6093                                   SmallVectorImpl<SDValue> &Inputs,
6094                                   SmallVectorImpl<int> &Mask,
6095                                   const SelectionDAG &DAG, unsigned Depth,
6096 bool ResolveKnownElts);
6097
6098// Attempt to decode ops that could be represented as a shuffle mask.
6099// The decoded shuffle mask may contain a different number of elements to the
6100// destination value type.
6101// TODO: Merge into getTargetShuffleInputs()
6102static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6103                               SmallVectorImpl<int> &Mask,
6104                               SmallVectorImpl<SDValue> &Ops,
6105                               const SelectionDAG &DAG, unsigned Depth,
6106 bool ResolveKnownElts) {
6107 Mask.clear();
6108 Ops.clear();
6109
6110 MVT VT = N.getSimpleValueType();
6111 unsigned NumElts = VT.getVectorNumElements();
6112 unsigned NumSizeInBits = VT.getSizeInBits();
6113 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6114 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6115 return false;
6116 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6117 unsigned NumSizeInBytes = NumSizeInBits / 8;
6118 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6119
6120 unsigned Opcode = N.getOpcode();
6121 switch (Opcode) {
6122 case ISD::VECTOR_SHUFFLE: {
6123 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6124 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6125 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6126 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6127 Ops.push_back(N.getOperand(0));
6128 Ops.push_back(N.getOperand(1));
6129 return true;
6130 }
6131 return false;
6132 }
6133 case ISD::AND:
6134 case X86ISD::ANDNP: {
6135 // Attempt to decode as a per-byte mask.
6136 APInt UndefElts;
6137 SmallVector<APInt, 32> EltBits;
6138 SDValue N0 = N.getOperand(0);
6139 SDValue N1 = N.getOperand(1);
6140 bool IsAndN = (X86ISD::ANDNP == Opcode);
6141 uint64_t ZeroMask = IsAndN ? 255 : 0;
6142 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6143 /*AllowWholeUndefs*/ false,
6144 /*AllowPartialUndefs*/ false))
6145 return false;
6146 // We can't assume an undef src element gives an undef dst - the other src
6147 // might be zero.
6148 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6149 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6150 const APInt &ByteBits = EltBits[i];
6151 if (ByteBits != 0 && ByteBits != 255)
6152 return false;
6153 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6154 }
6155 Ops.push_back(IsAndN ? N1 : N0);
6156 return true;
6157 }
6158 case ISD::OR: {
6159 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6160 // is a valid shuffle index.
6161 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6162 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6163 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6164 return false;
6165
6166 SmallVector<int, 64> SrcMask0, SrcMask1;
6167 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6168    APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6169    APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6170    if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6171 Depth + 1, true) ||
6172 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6173 Depth + 1, true))
6174 return false;
6175
6176 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6177 SmallVector<int, 64> Mask0, Mask1;
6178 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6179 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6180 for (int i = 0; i != (int)MaskSize; ++i) {
6181 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6182 // loops converting between OR and BLEND shuffles due to
6183 // canWidenShuffleElements merging away undef elements, meaning we
6184 // fail to recognise the OR as the undef element isn't known zero.
6185 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6186 Mask.push_back(SM_SentinelZero);
6187 else if (Mask1[i] == SM_SentinelZero)
6188 Mask.push_back(i);
6189 else if (Mask0[i] == SM_SentinelZero)
6190 Mask.push_back(i + MaskSize);
6191 else
6192 return false;
6193 }
6194 Ops.push_back(N.getOperand(0));
6195 Ops.push_back(N.getOperand(1));
6196 return true;
6197 }
6198 case ISD::CONCAT_VECTORS: {
6199 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6200 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6201 if (NumBitsPerElt == 64) {
6202 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6203 for (unsigned M = 0; M != NumSubElts; ++M)
6204 Mask.push_back((I * NumElts) + M);
6205 Ops.push_back(N.getOperand(I));
6206 }
6207 return true;
6208 }
6209 return false;
6210 }
6211 case ISD::INSERT_SUBVECTOR: {
6212 SDValue Src = N.getOperand(0);
6213 SDValue Sub = N.getOperand(1);
6214 EVT SubVT = Sub.getValueType();
6215 unsigned NumSubElts = SubVT.getVectorNumElements();
6216 uint64_t InsertIdx = N.getConstantOperandVal(2);
6217 // Subvector isn't demanded - just return the base vector.
6218 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6219 Mask.resize(NumElts);
6220 std::iota(Mask.begin(), Mask.end(), 0);
6221 Ops.push_back(Src);
6222 return true;
6223 }
6224 // Handle CONCAT(SUB0, SUB1).
6225 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6226 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6227 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6228 Src.getOperand(0).isUndef() &&
6229 Src.getOperand(1).getValueType() == SubVT &&
6230 Src.getConstantOperandVal(2) == 0 &&
6231 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6232 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6233 Mask.resize(NumElts);
6234 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6235 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6236 Ops.push_back(Src.getOperand(1));
6237 Ops.push_back(Sub);
6238 return true;
6239 }
6240 if (!N->isOnlyUserOf(Sub.getNode()))
6241 return false;
6242
6243 SmallVector<int, 64> SubMask;
6244 SmallVector<SDValue, 2> SubInputs;
6245    SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6246    EVT SubSrcVT = SubSrc.getValueType();
6247 if (!SubSrcVT.isVector())
6248 return false;
6249
6250 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6251 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6252 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6253 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6254 SDValue SubSrcSrc = SubSrc.getOperand(0);
6255 unsigned NumSubSrcSrcElts =
6256 SubSrcSrc.getValueType().getVectorNumElements();
6257 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6258 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6259 "Subvector valuetype mismatch");
6260 InsertIdx *= (MaxElts / NumElts);
6261 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6262 NumSubElts *= (MaxElts / NumElts);
6263 bool SrcIsUndef = Src.isUndef();
6264 for (int i = 0; i != (int)MaxElts; ++i)
6265 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6266 for (int i = 0; i != (int)NumSubElts; ++i)
6267 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6268 if (!SrcIsUndef)
6269 Ops.push_back(Src);
6270 Ops.push_back(SubSrcSrc);
6271 return true;
6272 }
6273
6274 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6275 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6276 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6277 Depth + 1, ResolveKnownElts))
6278 return false;
6279
6280 // Subvector shuffle inputs must not be larger than the subvector.
6281 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6282 return SubVT.getFixedSizeInBits() <
6283 SubInput.getValueSizeInBits().getFixedValue();
6284 }))
6285 return false;
6286
6287 if (SubMask.size() != NumSubElts) {
6288 assert(((SubMask.size() % NumSubElts) == 0 ||
6289 (NumSubElts % SubMask.size()) == 0) &&
6290 "Illegal submask scale");
6291 if ((NumSubElts % SubMask.size()) == 0) {
6292 int Scale = NumSubElts / SubMask.size();
6293 SmallVector<int, 64> ScaledSubMask;
6294 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6295 SubMask = ScaledSubMask;
6296 } else {
6297 int Scale = SubMask.size() / NumSubElts;
6298 NumSubElts = SubMask.size();
6299 NumElts *= Scale;
6300 InsertIdx *= Scale;
6301 }
6302 }
6303 Ops.push_back(Src);
6304 Ops.append(SubInputs.begin(), SubInputs.end());
6305 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6306 Mask.append(NumElts, SM_SentinelZero);
6307 else
6308 for (int i = 0; i != (int)NumElts; ++i)
6309 Mask.push_back(i);
6310 for (int i = 0; i != (int)NumSubElts; ++i) {
6311 int M = SubMask[i];
6312 if (0 <= M) {
6313 int InputIdx = M / NumSubElts;
6314 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6315 }
6316 Mask[i + InsertIdx] = M;
6317 }
6318 return true;
6319 }
6320 case X86ISD::PINSRB:
6321 case X86ISD::PINSRW:
6322  case ISD::SCALAR_TO_VECTOR:
6323  case ISD::INSERT_VECTOR_ELT: {
6324    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6325 // vector, for matching src/dst vector types.
6326 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6327
6328 unsigned DstIdx = 0;
6329 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6330 // Check we have an in-range constant insertion index.
6331 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6332 N.getConstantOperandAPInt(2).uge(NumElts))
6333 return false;
6334 DstIdx = N.getConstantOperandVal(2);
6335
6336 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6337 if (X86::isZeroNode(Scl)) {
6338 Ops.push_back(N.getOperand(0));
6339 for (unsigned i = 0; i != NumElts; ++i)
6340 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6341 return true;
6342 }
6343 }
6344
6345 // Peek through trunc/aext/zext/bitcast.
6346 // TODO: aext shouldn't require SM_SentinelZero padding.
6347 // TODO: handle shift of scalars.
6348 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6349 while (Scl.getOpcode() == ISD::TRUNCATE ||
6350 Scl.getOpcode() == ISD::ANY_EXTEND ||
6351 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6352 (Scl.getOpcode() == ISD::BITCAST &&
6355 Scl = Scl.getOperand(0);
6356 MinBitsPerElt =
6357 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6358 }
6359 if ((MinBitsPerElt % 8) != 0)
6360 return false;
6361
6362 // Attempt to find the source vector the scalar was extracted from.
6363 SDValue SrcExtract;
6364 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6365 Scl.getOpcode() == X86ISD::PEXTRW ||
6366 Scl.getOpcode() == X86ISD::PEXTRB) &&
6367 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6368 SrcExtract = Scl;
6369 }
6370 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6371 return false;
6372
6373 SDValue SrcVec = SrcExtract.getOperand(0);
6374 EVT SrcVT = SrcVec.getValueType();
6375 if (!SrcVT.getScalarType().isByteSized())
6376 return false;
6377 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6378 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6379 unsigned DstByte = DstIdx * NumBytesPerElt;
6380 MinBitsPerElt =
6381 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6382
6383 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6384 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6385 Ops.push_back(SrcVec);
6386 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6387 } else {
6388 Ops.push_back(SrcVec);
6389 Ops.push_back(N.getOperand(0));
6390 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6391 Mask.push_back(NumSizeInBytes + i);
6392 }
6393
6394 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6395 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6396 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6397 Mask[DstByte + i] = SrcByte + i;
6398 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6399 Mask[DstByte + i] = SM_SentinelZero;
6400 return true;
6401 }
6402 case X86ISD::PACKSS:
6403 case X86ISD::PACKUS: {
6404 SDValue N0 = N.getOperand(0);
6405 SDValue N1 = N.getOperand(1);
6406 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6407 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6408 "Unexpected input value type");
6409
6410 APInt EltsLHS, EltsRHS;
6411 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6412
6413 // If we know input saturation won't happen (or we don't care for particular
6414 // lanes), we can treat this as a truncation shuffle.
6415 bool Offset0 = false, Offset1 = false;
6416 if (Opcode == X86ISD::PACKSS) {
6417 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6418 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6419 (!(N1.isUndef() || EltsRHS.isZero()) &&
6420 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6421 return false;
6422 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6423 // PACKSS then it was likely being used for sign-extension for a
6424 // truncation, so just peek through and adjust the mask accordingly.
6425 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6426 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6427 Offset0 = true;
6428 N0 = N0.getOperand(0);
6429 }
6430 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6431 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6432 Offset1 = true;
6433 N1 = N1.getOperand(0);
6434 }
6435 } else {
6436 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6437 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6438 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6439 (!(N1.isUndef() || EltsRHS.isZero()) &&
6440 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6441 return false;
6442 }
6443
6444 bool IsUnary = (N0 == N1);
6445
6446 Ops.push_back(N0);
6447 if (!IsUnary)
6448 Ops.push_back(N1);
6449
6450 createPackShuffleMask(VT, Mask, IsUnary);
6451
6452 if (Offset0 || Offset1) {
6453 for (int &M : Mask)
6454 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6455 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6456 ++M;
6457 }
6458 return true;
6459 }
6460 case ISD::VSELECT:
6461 case X86ISD::BLENDV: {
6462 SDValue Cond = N.getOperand(0);
6463 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6464 Ops.push_back(N.getOperand(1));
6465 Ops.push_back(N.getOperand(2));
6466 return true;
6467 }
6468 return false;
6469 }
6470 case X86ISD::VTRUNC: {
6471 SDValue Src = N.getOperand(0);
6472 EVT SrcVT = Src.getValueType();
6473 if (SrcVT.getSizeInBits() != NumSizeInBits)
6474 return false;
6475 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6476 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6477 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6478 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6479 for (unsigned i = 0; i != NumSrcElts; ++i)
6480 Mask.push_back(i * Scale);
6481 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6482 Ops.push_back(Src);
6483 return true;
6484 }
6485 case ISD::SHL:
6486 case ISD::SRL: {
6487 APInt UndefElts;
6488 SmallVector<APInt, 32> EltBits;
6489 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6490 UndefElts, EltBits,
6491 /*AllowWholeUndefs*/ true,
6492 /*AllowPartialUndefs*/ false))
6493 return false;
6494
6495 // We can only decode 'whole byte' bit shifts as shuffles.
6496 for (unsigned I = 0; I != NumElts; ++I)
6497 if (DemandedElts[I] && !UndefElts[I] &&
6498 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6499 return false;
6500
6501 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6502 Ops.push_back(N.getOperand(0));
6503
6504 for (unsigned I = 0; I != NumElts; ++I) {
6505 if (!DemandedElts[I] || UndefElts[I])
6506 continue;
6507 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6508 unsigned Lo = I * NumBytesPerElt;
6509 unsigned Hi = Lo + NumBytesPerElt;
6510 // Clear mask to all zeros and insert the shifted byte indices.
6511 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6512 if (ISD::SHL == Opcode)
6513 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6514 else
6515 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6516 Lo + ByteShift);
6517 }
6518 return true;
6519 }
6520 case X86ISD::VSHLI:
6521 case X86ISD::VSRLI: {
6522 uint64_t ShiftVal = N.getConstantOperandVal(1);
6523 // Out of range bit shifts are guaranteed to be zero.
6524 if (NumBitsPerElt <= ShiftVal) {
6525 Mask.append(NumElts, SM_SentinelZero);
6526 return true;
6527 }
6528
6529 // We can only decode 'whole byte' bit shifts as shuffles.
6530 if ((ShiftVal % 8) != 0)
6531 break;
6532
6533 uint64_t ByteShift = ShiftVal / 8;
6534 Ops.push_back(N.getOperand(0));
6535
6536 // Clear mask to all zeros and insert the shifted byte indices.
6537 Mask.append(NumSizeInBytes, SM_SentinelZero);
6538
6539 if (X86ISD::VSHLI == Opcode) {
6540 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6541 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6542 Mask[i + j] = i + j - ByteShift;
6543 } else {
6544 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6545 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6546 Mask[i + j - ByteShift] = i + j;
6547 }
6548 return true;
6549 }
6550 case X86ISD::VROTLI:
6551 case X86ISD::VROTRI: {
6552 // We can only decode 'whole byte' bit rotates as shuffles.
6553 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6554 if ((RotateVal % 8) != 0)
6555 return false;
6556 Ops.push_back(N.getOperand(0));
6557 int Offset = RotateVal / 8;
6558 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6559 for (int i = 0; i != (int)NumElts; ++i) {
6560 int BaseIdx = i * NumBytesPerElt;
6561 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6562 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6563 }
6564 }
6565 return true;
6566 }
6567 case X86ISD::VBROADCAST: {
6568 SDValue Src = N.getOperand(0);
6569 if (!Src.getSimpleValueType().isVector()) {
6570 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6571 !isNullConstant(Src.getOperand(1)) ||
6572 Src.getOperand(0).getValueType().getScalarType() !=
6573 VT.getScalarType())
6574 return false;
6575 Src = Src.getOperand(0);
6576 }
6577 Ops.push_back(Src);
6578 Mask.append(NumElts, 0);
6579 return true;
6580 }
6581  case ISD::SIGN_EXTEND_VECTOR_INREG: {
6582    SDValue Src = N.getOperand(0);
6583 EVT SrcVT = Src.getValueType();
6584 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6585
6586 // Extended source must be a simple vector.
6587 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6588 (NumBitsPerSrcElt % 8) != 0)
6589 return false;
6590
6591 // We can only handle all-signbits extensions.
6592 APInt DemandedSrcElts =
6593 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6594 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6595 return false;
6596
6597 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6598 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6599 for (unsigned I = 0; I != NumElts; ++I)
6600 Mask.append(Scale, I);
6601 Ops.push_back(Src);
6602 return true;
6603 }
6604 case ISD::ZERO_EXTEND:
6605 case ISD::ANY_EXTEND:
6606  case ISD::ZERO_EXTEND_VECTOR_INREG:
6607  case ISD::ANY_EXTEND_VECTOR_INREG: {
6608    SDValue Src = N.getOperand(0);
6609 EVT SrcVT = Src.getValueType();
6610
6611 // Extended source must be a simple vector.
6612 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6613 (SrcVT.getScalarSizeInBits() % 8) != 0)
6614 return false;
6615
6616 bool IsAnyExtend =
6617 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6618 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6619 IsAnyExtend, Mask);
6620 Ops.push_back(Src);
6621 return true;
6622 }
6623 }
6624
6625 return false;
6626}
6627
6628/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6630 SmallVectorImpl<int> &Mask) {
6631 int MaskWidth = Mask.size();
6632 SmallVector<SDValue, 16> UsedInputs;
6633 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6634 int lo = UsedInputs.size() * MaskWidth;
6635 int hi = lo + MaskWidth;
6636
6637 // Strip UNDEF input usage.
6638 if (Inputs[i].isUndef())
6639 for (int &M : Mask)
6640 if ((lo <= M) && (M < hi))
6641 M = SM_SentinelUndef;
6642
6643 // Check for unused inputs.
6644 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6645 for (int &M : Mask)
6646 if (lo <= M)
6647 M -= MaskWidth;
6648 continue;
6649 }
6650
6651 // Check for repeated inputs.
6652 bool IsRepeat = false;
6653 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6654 if (UsedInputs[j] != Inputs[i])
6655 continue;
6656 for (int &M : Mask)
6657 if (lo <= M)
6658 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6659 IsRepeat = true;
6660 break;
6661 }
6662 if (IsRepeat)
6663 continue;
6664
6665 UsedInputs.push_back(Inputs[i]);
6666 }
6667 Inputs = UsedInputs;
6668}
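// Worked example (illustrative): with Inputs = {A, A} and a width-4 mask
// {0, 4, 1, 5}, the repeated input is dropped and the mask is remapped to
// {0, 0, 1, 1} over Inputs = {A}; an input that no mask element references is
// removed instead and the higher indices are shifted down.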
6669
6670/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6671/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6672/// Returns true if the target shuffle mask was decoded.
6673static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6674                                   SmallVectorImpl<SDValue> &Inputs,
6675                                   SmallVectorImpl<int> &Mask,
6676                                   APInt &KnownUndef, APInt &KnownZero,
6677 const SelectionDAG &DAG, unsigned Depth,
6678 bool ResolveKnownElts) {
6679  if (Depth >= SelectionDAG::MaxRecursionDepth)
6680    return false; // Limit search depth.
6681
6682 EVT VT = Op.getValueType();
6683 if (!VT.isSimple() || !VT.isVector())
6684 return false;
6685
6686 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6687 if (ResolveKnownElts)
6688 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6689 return true;
6690 }
6691 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6692 ResolveKnownElts)) {
6693 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6694 return true;
6695 }
6696 return false;
6697}
6698
6699static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6700                                   SmallVectorImpl<SDValue> &Inputs,
6701                                   SmallVectorImpl<int> &Mask,
6702                                   const SelectionDAG &DAG, unsigned Depth,
6703 bool ResolveKnownElts) {
6704 APInt KnownUndef, KnownZero;
6705 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6706 KnownZero, DAG, Depth, ResolveKnownElts);
6707}
6708
6709static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6710                                   SmallVectorImpl<int> &Mask,
6711                                   const SelectionDAG &DAG, unsigned Depth = 0,
6712 bool ResolveKnownElts = true) {
6713 EVT VT = Op.getValueType();
6714 if (!VT.isSimple() || !VT.isVector())
6715 return false;
6716
6717 unsigned NumElts = Op.getValueType().getVectorNumElements();
6718 APInt DemandedElts = APInt::getAllOnes(NumElts);
6719 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6720 ResolveKnownElts);
6721}
6722
6723// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6724static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6725 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6726 SelectionDAG &DAG) {
6727 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6728 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6729 "Unknown broadcast load type");
6730
6731  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6732 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6733 return SDValue();
6734
6735  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6736                                         TypeSize::getFixed(Offset), DL);
6737  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6738 SDValue Ops[] = {Mem->getChain(), Ptr};
6739 SDValue BcstLd = DAG.getMemIntrinsicNode(
6740 Opcode, DL, Tys, Ops, MemVT,
6741      DAG.getMachineFunction().getMachineMemOperand(
6742          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6743 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6744 return BcstLd;
6745}
6746
6747/// Returns the scalar element that will make up the i'th
6748/// element of the result of the vector shuffle.
6749static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6750 SelectionDAG &DAG, unsigned Depth) {
6752 return SDValue(); // Limit search depth.
6753
6754 EVT VT = Op.getValueType();
6755 unsigned Opcode = Op.getOpcode();
6756 unsigned NumElems = VT.getVectorNumElements();
6757
6758 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6759 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6760 int Elt = SV->getMaskElt(Index);
6761
6762 if (Elt < 0)
6763 return DAG.getUNDEF(VT.getVectorElementType());
6764
6765 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6766 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6767 }
6768
6769 // Recurse into target specific vector shuffles to find scalars.
6770 if (isTargetShuffle(Opcode)) {
6771 MVT ShufVT = VT.getSimpleVT();
6772 MVT ShufSVT = ShufVT.getVectorElementType();
6773 int NumElems = (int)ShufVT.getVectorNumElements();
6774 SmallVector<int, 16> ShuffleMask;
6775    SmallVector<SDValue, 16> ShuffleOps;
6776    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6777 return SDValue();
6778
6779 int Elt = ShuffleMask[Index];
6780 if (Elt == SM_SentinelZero)
6781 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6782 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6783 if (Elt == SM_SentinelUndef)
6784 return DAG.getUNDEF(ShufSVT);
6785
6786 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6787 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6788 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6789 }
6790
6791 // Recurse into insert_subvector base/sub vector to find scalars.
6792 if (Opcode == ISD::INSERT_SUBVECTOR) {
6793 SDValue Vec = Op.getOperand(0);
6794 SDValue Sub = Op.getOperand(1);
6795 uint64_t SubIdx = Op.getConstantOperandVal(2);
6796 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6797
6798 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6799 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6800 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6801 }
6802
6803 // Recurse into concat_vectors sub vector to find scalars.
6804 if (Opcode == ISD::CONCAT_VECTORS) {
6805 EVT SubVT = Op.getOperand(0).getValueType();
6806 unsigned NumSubElts = SubVT.getVectorNumElements();
6807 uint64_t SubIdx = Index / NumSubElts;
6808 uint64_t SubElt = Index % NumSubElts;
6809 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6810 }
6811
6812 // Recurse into extract_subvector src vector to find scalars.
6813 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6814 SDValue Src = Op.getOperand(0);
6815 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6816 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6817 }
6818
6819 // We only peek through bitcasts of the same vector width.
6820 if (Opcode == ISD::BITCAST) {
6821 SDValue Src = Op.getOperand(0);
6822 EVT SrcVT = Src.getValueType();
6823 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6824 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6825 return SDValue();
6826 }
6827
6828 // Actual nodes that may contain scalar elements
6829
6830 // For insert_vector_elt - either return the index matching scalar or recurse
6831 // into the base vector.
6832 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6833 isa<ConstantSDNode>(Op.getOperand(2))) {
6834 if (Op.getConstantOperandAPInt(2) == Index)
6835 return Op.getOperand(1);
6836 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6837 }
6838
6839 if (Opcode == ISD::SCALAR_TO_VECTOR)
6840 return (Index == 0) ? Op.getOperand(0)
6841 : DAG.getUNDEF(VT.getVectorElementType());
6842
6843 if (Opcode == ISD::BUILD_VECTOR)
6844 return Op.getOperand(Index);
6845
6846 return SDValue();
6847}
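// Worked example (illustrative): for Op = vector_shuffle<3, 0, 1, 2> of two
// BUILD_VECTORs, getShuffleScalarElt(Op, 0, DAG, 0) follows mask element 3
// into the first source and returns that BUILD_VECTOR's operand 3.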
6848
6849// Use PINSRB/PINSRW/PINSRD to create a build vector.
6850static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6851                                        const APInt &NonZeroMask,
6852 unsigned NumNonZero, unsigned NumZero,
6853 SelectionDAG &DAG,
6854 const X86Subtarget &Subtarget) {
6855 MVT VT = Op.getSimpleValueType();
6856 unsigned NumElts = VT.getVectorNumElements();
6857 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6858 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6859 "Illegal vector insertion");
6860
6861 SDValue V;
6862 bool First = true;
6863
6864 for (unsigned i = 0; i < NumElts; ++i) {
6865 bool IsNonZero = NonZeroMask[i];
6866 if (!IsNonZero)
6867 continue;
6868
6869    // If the build vector contains zeros or our first insertion is not the
6870    // first index, then insert into a zero vector to break any register
6871    // dependency; else use SCALAR_TO_VECTOR.
6872 if (First) {
6873 First = false;
6874 if (NumZero || 0 != i)
6875 V = getZeroVector(VT, Subtarget, DAG, DL);
6876 else {
6877 assert(0 == i && "Expected insertion into zero-index");
6878 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6879 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6880 V = DAG.getBitcast(VT, V);
6881 continue;
6882 }
6883 }
6884 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6885 DAG.getVectorIdxConstant(i, DL));
6886 }
6887
6888 return V;
6889}
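// Worked example (illustrative): building v8i16 {x, 0, 0, 0, y, 0, 0, 0} with
// SSE2 starts from a zero vector (zeros are required) and emits two
// INSERT_VECTOR_ELT nodes at indices 0 and 4, which later select to PINSRW.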
6890
6891/// Custom lower build_vector of v16i8.
6892static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6893                                     const APInt &NonZeroMask,
6894 unsigned NumNonZero, unsigned NumZero,
6895 SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6898 return SDValue();
6899
6900 // SSE4.1 - use PINSRB to insert each byte directly.
6901 if (Subtarget.hasSSE41())
6902 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6903 DAG, Subtarget);
6904
6905 SDValue V;
6906
6907 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6908 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6909 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6910 !NonZeroMask.extractBits(2, 2).isZero()) {
6911 for (unsigned I = 0; I != 4; ++I) {
6912 if (!NonZeroMask[I])
6913 continue;
6914 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6915 if (I != 0)
6916 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6917 DAG.getConstant(I * 8, DL, MVT::i8));
6918 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6919 }
6920 assert(V && "Failed to fold v16i8 vector to zero");
6921 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6922 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6923 V = DAG.getBitcast(MVT::v8i16, V);
6924 }
6925 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6926 bool ThisIsNonZero = NonZeroMask[i];
6927 bool NextIsNonZero = NonZeroMask[i + 1];
6928 if (!ThisIsNonZero && !NextIsNonZero)
6929 continue;
6930
6931 SDValue Elt;
6932 if (ThisIsNonZero) {
6933 if (NumZero || NextIsNonZero)
6934 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6935 else
6936 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6937 }
6938
6939 if (NextIsNonZero) {
6940 SDValue NextElt = Op.getOperand(i + 1);
6941 if (i == 0 && NumZero)
6942 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6943 else
6944 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6945 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6946 DAG.getConstant(8, DL, MVT::i8));
6947 if (ThisIsNonZero)
6948 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6949 else
6950 Elt = NextElt;
6951 }
6952
6953 // If our first insertion is not the first index or zeros are needed, then
6954 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6955 // elements undefined).
6956 if (!V) {
6957 if (i != 0 || NumZero)
6958 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6959 else {
6960 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6961 V = DAG.getBitcast(MVT::v8i16, V);
6962 continue;
6963 }
6964 }
6965 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6966 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6967 DAG.getVectorIdxConstant(i / 2, DL));
6968 }
6969
6970 return DAG.getBitcast(MVT::v16i8, V);
6971}
6972
6973/// Custom lower build_vector of v8i16.
6974static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6975 const APInt &NonZeroMask,
6976 unsigned NumNonZero, unsigned NumZero,
6977 SelectionDAG &DAG,
6978 const X86Subtarget &Subtarget) {
6979 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6980 return SDValue();
6981
6982 // Use PINSRW to insert each byte directly.
6983 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6984 Subtarget);
6985}
6986
6987/// Custom lower build_vector of v4i32 or v4f32.
6988static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6989 SelectionDAG &DAG,
6990 const X86Subtarget &Subtarget) {
6991 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6992 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6993 // Because we're creating a less complicated build vector here, we may enable
6994 // further folding of the MOVDDUP via shuffle transforms.
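 // E.g. (illustrative): (build_vector f32 a, f32 b, f32 a, f32 b) becomes
 //   (bitcast (movddup (bitcast (build_vector a, b, undef, undef) to v2f64))
 //            to v4f32)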
6995 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6996 Op.getOperand(0) == Op.getOperand(2) &&
6997 Op.getOperand(1) == Op.getOperand(3) &&
6998 Op.getOperand(0) != Op.getOperand(1)) {
6999 MVT VT = Op.getSimpleValueType();
7000 MVT EltVT = VT.getVectorElementType();
7001 // Create a new build vector with the first 2 elements followed by undef
7002 // padding, bitcast to v2f64, duplicate, and bitcast back.
7003 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7004 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7005 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7006 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7007 return DAG.getBitcast(VT, Dup);
7008 }
7009
7010 // Find all zeroable elements.
7011 std::bitset<4> Zeroable, Undefs;
7012 for (int i = 0; i < 4; ++i) {
7013 SDValue Elt = Op.getOperand(i);
7014 Undefs[i] = Elt.isUndef();
7015 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7016 }
7017 assert(Zeroable.size() - Zeroable.count() > 1 &&
7018 "We expect at least two non-zero elements!");
7019
7020 // We only know how to deal with build_vector nodes where elements are either
7021 // zeroable or extract_vector_elt with constant index.
7022 SDValue FirstNonZero;
7023 unsigned FirstNonZeroIdx;
7024 for (unsigned i = 0; i < 4; ++i) {
7025 if (Zeroable[i])
7026 continue;
7027 SDValue Elt = Op.getOperand(i);
7028 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7029 !isa<ConstantSDNode>(Elt.getOperand(1)))
7030 return SDValue();
7031 // Make sure that this node is extracting from a 128-bit vector.
7032 MVT VT = Elt.getOperand(0).getSimpleValueType();
7033 if (!VT.is128BitVector())
7034 return SDValue();
7035 if (!FirstNonZero.getNode()) {
7036 FirstNonZero = Elt;
7037 FirstNonZeroIdx = i;
7038 }
7039 }
7040
7041 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7042 SDValue V1 = FirstNonZero.getOperand(0);
7043 MVT VT = V1.getSimpleValueType();
7044
7045 // See if this build_vector can be lowered as a blend with zero.
7046 SDValue Elt;
7047 unsigned EltMaskIdx, EltIdx;
7048 int Mask[4];
7049 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7050 if (Zeroable[EltIdx]) {
7051 // The zero vector will be on the right hand side.
7052 Mask[EltIdx] = EltIdx+4;
7053 continue;
7054 }
7055
7056 Elt = Op->getOperand(EltIdx);
7057 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
7058 EltMaskIdx = Elt.getConstantOperandVal(1);
7059 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7060 break;
7061 Mask[EltIdx] = EltIdx;
7062 }
7063
7064 if (EltIdx == 4) {
7065 // Let the shuffle legalizer deal with blend operations.
7066 SDValue VZeroOrUndef = (Zeroable == Undefs)
7067 ? DAG.getUNDEF(VT)
7068 : getZeroVector(VT, Subtarget, DAG, DL);
7069 if (V1.getSimpleValueType() != VT)
7070 V1 = DAG.getBitcast(VT, V1);
7071 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7072 }
7073
7074 // See if we can lower this build_vector to a INSERTPS.
7075 if (!Subtarget.hasSSE41())
7076 return SDValue();
7077
7078 SDValue V2 = Elt.getOperand(0);
7079 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7080 V1 = SDValue();
7081
7082 bool CanFold = true;
7083 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7084 if (Zeroable[i])
7085 continue;
7086
7087 SDValue Current = Op->getOperand(i);
7088 SDValue SrcVector = Current->getOperand(0);
7089 if (!V1.getNode())
7090 V1 = SrcVector;
7091 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7092 }
7093
7094 if (!CanFold)
7095 return SDValue();
7096
7097 assert(V1.getNode() && "Expected at least two non-zero elements!");
7098 if (V1.getSimpleValueType() != MVT::v4f32)
7099 V1 = DAG.getBitcast(MVT::v4f32, V1);
7100 if (V2.getSimpleValueType() != MVT::v4f32)
7101 V2 = DAG.getBitcast(MVT::v4f32, V2);
7102
7103 // Ok, we can emit an INSERTPS instruction.
7104 unsigned ZMask = Zeroable.to_ulong();
7105
7106 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7107 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7108 SDValue Result =
7109 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7110 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7111 return DAG.getBitcast(VT, Result);
7112}
7113
7114/// Return a vector logical shift node.
7115static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7116 SelectionDAG &DAG, const TargetLowering &TLI,
7117 const SDLoc &dl) {
7118 assert(VT.is128BitVector() && "Unknown type for VShift");
7119 MVT ShVT = MVT::v16i8;
7120 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7121 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7122 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7123 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7124 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7125}
7126
7127static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7128 SelectionDAG &DAG) {
7129
7130 // Check if the scalar load can be widened into a vector load, and if the
7131 // address is "base + cst", see whether the cst can be "absorbed" into
7132 // the shuffle mask.
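 // Illustrative example (not part of the original comment): a 4-byte scalar
 // load from a 16-byte-aligned stack slot at offset 8 can be widened to a
 // v4f32 load of the whole slot, with the requested value selected by the
 // splat shuffle mask <2,2,2,2>.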
7133 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7134 SDValue Ptr = LD->getBasePtr();
7135 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7136 return SDValue();
7137 EVT PVT = LD->getValueType(0);
7138 if (PVT != MVT::i32 && PVT != MVT::f32)
7139 return SDValue();
7140
7141 int FI = -1;
7142 int64_t Offset = 0;
7143 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7144 FI = FINode->getIndex();
7145 Offset = 0;
7146 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7147 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7148 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7149 Offset = Ptr.getConstantOperandVal(1);
7150 Ptr = Ptr.getOperand(0);
7151 } else {
7152 return SDValue();
7153 }
7154
7155 // FIXME: 256-bit vector instructions don't require a strict alignment,
7156 // improve this code to support it better.
7157 Align RequiredAlign(VT.getSizeInBits() / 8);
7158 SDValue Chain = LD->getChain();
7159 // Make sure the stack object alignment is at least 16 or 32.
7160 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7161 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7162 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7163 if (MFI.isFixedObjectIndex(FI)) {
7164 // Can't change the alignment. FIXME: It's possible to compute the exact
7165 // stack offset and reference FI + adjusted offset instead, if someone
7166 // *really* cares about this; that's the way to implement it.
7167 return SDValue();
7168 } else {
7169 MFI.setObjectAlignment(FI, RequiredAlign);
7170 }
7171 }
7172
7173 // The offset into the 16/32-byte aligned object must be a multiple of 4;
7174 // the widened load address is then Ptr + (Offset & ~(RequiredAlign - 1)).
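 // Worked example (illustrative, not from the original comment): with
 // RequiredAlign = 16 and Offset = 20, StartOffset = 20 & ~15 = 16, the
 // widened load starts at Ptr + 16, and EltNo = (20 - 16) >> 2 = 1 selects
 // the originally requested scalar.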
7175 if (Offset < 0)
7176 return SDValue();
7177 if ((Offset % RequiredAlign.value()) & 3)
7178 return SDValue();
7179 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7180 if (StartOffset) {
7181 SDLoc DL(Ptr);
7182 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7183 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7184 }
7185
7186 int EltNo = (Offset - StartOffset) >> 2;
7187 unsigned NumElems = VT.getVectorNumElements();
7188
7189 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7190 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7191 LD->getPointerInfo().getWithOffset(StartOffset));
7192
7193 SmallVector<int, 8> Mask(NumElems, EltNo);
7194
7195 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7196 }
7197
7198 return SDValue();
7199}
7200
7201// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7202static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7203 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7204 auto *BaseLd = cast<LoadSDNode>(Elt);
7205 if (!BaseLd->isSimple())
7206 return false;
7207 Ld = BaseLd;
7208 ByteOffset = 0;
7209 return true;
7210 }
7211
7212 switch (Elt.getOpcode()) {
7213 case ISD::BITCAST:
7214 case ISD::TRUNCATE:
7215 case ISD::SCALAR_TO_VECTOR:
7216 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7217 case ISD::SRL:
7218 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7219 uint64_t Amt = AmtC->getZExtValue();
7220 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7221 ByteOffset += Amt / 8;
7222 return true;
7223 }
7224 }
7225 break;
7226 case ISD::EXTRACT_VECTOR_ELT:
7227 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7228 SDValue Src = Elt.getOperand(0);
7229 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7230 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7231 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7232 findEltLoadSrc(Src, Ld, ByteOffset)) {
7233 uint64_t Idx = IdxC->getZExtValue();
7234 ByteOffset += Idx * (SrcSizeInBits / 8);
7235 return true;
7236 }
7237 }
7238 break;
7239 }
7240
7241 return false;
7242}
7243
7244/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7245/// elements can be replaced by a single large load which has the same value as
7246/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7247///
7248/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7249static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7250 const SDLoc &DL, SelectionDAG &DAG,
7251 const X86Subtarget &Subtarget,
7252 bool IsAfterLegalize) {
7253 if ((VT.getScalarSizeInBits() % 8) != 0)
7254 return SDValue();
7255
7256 unsigned NumElems = Elts.size();
7257
7258 int LastLoadedElt = -1;
7259 APInt LoadMask = APInt::getZero(NumElems);
7260 APInt ZeroMask = APInt::getZero(NumElems);
7261 APInt UndefMask = APInt::getZero(NumElems);
7262
7263 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7264 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7265
7266 // For each element in the initializer, see if we've found a load, zero or an
7267 // undef.
7268 for (unsigned i = 0; i < NumElems; ++i) {
7269 SDValue Elt = peekThroughBitcasts(Elts[i]);
7270 if (!Elt.getNode())
7271 return SDValue();
7272 if (Elt.isUndef()) {
7273 UndefMask.setBit(i);
7274 continue;
7275 }
7276 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7277 ZeroMask.setBit(i);
7278 continue;
7279 }
7280
7281 // Each loaded element must be the correct fractional portion of the
7282 // requested vector load.
7283 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7284 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7285 return SDValue();
7286
7287 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7288 return SDValue();
7289 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7290 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7291 return SDValue();
7292
7293 LoadMask.setBit(i);
7294 LastLoadedElt = i;
7295 }
7296 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7297 NumElems &&
7298 "Incomplete element masks");
7299
7300 // Handle Special Cases - all undef or undef/zero.
7301 if (UndefMask.popcount() == NumElems)
7302 return DAG.getUNDEF(VT);
7303 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7304 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7305 : DAG.getConstantFP(0.0, DL, VT);
7306
7307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7308 int FirstLoadedElt = LoadMask.countr_zero();
7309 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7310 EVT EltBaseVT = EltBase.getValueType();
7311 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7312 "Register/Memory size mismatch");
7313 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7314 assert(LDBase && "Did not find base load for merging consecutive loads");
7315 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7316 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7317 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7318 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7319 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7320
7321 // TODO: Support offsetting the base load.
7322 if (ByteOffsets[FirstLoadedElt] != 0)
7323 return SDValue();
7324
7325 // Check to see if the element's load is consecutive to the base load
7326 // or offset from a previous (already checked) load.
7327 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7328 LoadSDNode *Ld = Loads[EltIdx];
7329 int64_t ByteOffset = ByteOffsets[EltIdx];
7330 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7331 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7332 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7333 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7334 }
7335 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7336 EltIdx - FirstLoadedElt);
7337 };
7338
7339 // Consecutive loads can contain UNDEFs but not ZERO elements.
7340 // Consecutive loads with UNDEF and ZERO elements require an additional
7341 // shuffle stage to clear the ZERO elements.
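 // E.g. (illustrative): <load a[0], zero, load a[2], load a[3]> can still be
 // lowered as a single v4i32 load of a[0..3] followed by a shuffle with a
 // zero vector, mask <0,5,2,3>, to clear the element that must be zero.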
7342 bool IsConsecutiveLoad = true;
7343 bool IsConsecutiveLoadWithZeros = true;
7344 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7345 if (LoadMask[i]) {
7346 if (!CheckConsecutiveLoad(LDBase, i)) {
7347 IsConsecutiveLoad = false;
7348 IsConsecutiveLoadWithZeros = false;
7349 break;
7350 }
7351 } else if (ZeroMask[i]) {
7352 IsConsecutiveLoad = false;
7353 }
7354 }
7355
7356 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7357 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7358 assert(LDBase->isSimple() &&
7359 "Cannot merge volatile or atomic loads.");
7360 SDValue NewLd =
7361 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7362 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7363 for (auto *LD : Loads)
7364 if (LD)
7365 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7366 return NewLd;
7367 };
7368
7369 // Check if the base load is entirely dereferenceable.
7370 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7371 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7372
7373 // LOAD - all consecutive load/undefs (must start/end with a load or be
7374 // entirely dereferenceable). If we have found an entire vector of loads and
7375 // undefs, then return a large load of the entire vector width starting at the
7376 // base pointer. If the vector contains zeros, then attempt to shuffle those
7377 // elements.
7378 if (FirstLoadedElt == 0 &&
7379 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7380 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7381 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7382 return SDValue();
7383
7384 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7385 // will lower to regular temporal loads and use the cache.
7386 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7387 VT.is256BitVector() && !Subtarget.hasInt256())
7388 return SDValue();
7389
7390 if (NumElems == 1)
7391 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7392
7393 if (!ZeroMask)
7394 return CreateLoad(VT, LDBase);
7395
7396 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7397 // vector and a zero vector to clear out the zero elements.
7398 if (!IsAfterLegalize && VT.isVector()) {
7399 unsigned NumMaskElts = VT.getVectorNumElements();
7400 if ((NumMaskElts % NumElems) == 0) {
7401 unsigned Scale = NumMaskElts / NumElems;
7402 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7403 for (unsigned i = 0; i < NumElems; ++i) {
7404 if (UndefMask[i])
7405 continue;
7406 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7407 for (unsigned j = 0; j != Scale; ++j)
7408 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7409 }
7410 SDValue V = CreateLoad(VT, LDBase);
7411 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7412 : DAG.getConstantFP(0.0, DL, VT);
7413 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7414 }
7415 }
7416 }
7417
7418 // If the upper half of a ymm/zmm load is undef then just load the lower half.
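 // E.g. (illustrative): building a v4i64 from <load a[0], load a[1], undef,
 // undef> can instead load a v2i64 from 'a' and insert it into the low half
 // of an undef v4i64.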
7419 if (VT.is256BitVector() || VT.is512BitVector()) {
7420 unsigned HalfNumElems = NumElems / 2;
7421 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7422 EVT HalfVT =
7423 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7424 SDValue HalfLD =
7425 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7426 DAG, Subtarget, IsAfterLegalize);
7427 if (HalfLD)
7428 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7429 HalfLD, DAG.getVectorIdxConstant(0, DL));
7430 }
7431 }
7432
7433 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7434 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7435 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7436 LoadSizeInBits == 64) &&
7437 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7438 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7439 : MVT::getIntegerVT(LoadSizeInBits);
7440 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7441 // Allow v4f32 on SSE1 only targets.
7442 // FIXME: Add more isel patterns so we can just use VT directly.
7443 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7444 VecVT = MVT::v4f32;
7445 if (TLI.isTypeLegal(VecVT)) {
7446 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7447 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7448 SDValue ResNode = DAG.getMemIntrinsicNode(
7449 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7450 LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7451 for (auto *LD : Loads)
7452 if (LD)
7453 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7454 return DAG.getBitcast(VT, ResNode);
7455 }
7456 }
7457
7458 // BROADCAST - match the smallest possible repetition pattern, load that
7459 // scalar/subvector element and then broadcast to the entire vector.
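 // E.g. (illustrative): <load a[0], load a[1], load a[0], load a[1]> repeats
 // every two elements, so the two-element pattern (64 bits for i32 elements)
 // is loaded once and then broadcast to fill the whole vector.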
7460 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7461 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7462 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7463 unsigned RepeatSize = SubElems * BaseSizeInBits;
7464 unsigned ScalarSize = std::min(RepeatSize, 64u);
7465 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7466 continue;
7467
7468 // Don't attempt a 1:N subvector broadcast - it should be caught by
7469 // combineConcatVectorOps, otherwise it will cause infinite loops.
7470 if (RepeatSize > ScalarSize && SubElems == 1)
7471 continue;
7472
7473 bool Match = true;
7474 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7475 for (unsigned i = 0; i != NumElems && Match; ++i) {
7476 if (!LoadMask[i])
7477 continue;
7478 SDValue Elt = peekThroughBitcasts(Elts[i]);
7479 if (RepeatedLoads[i % SubElems].isUndef())
7480 RepeatedLoads[i % SubElems] = Elt;
7481 else
7482 Match &= (RepeatedLoads[i % SubElems] == Elt);
7483 }
7484
7485 // We must have loads at both ends of the repetition.
7486 Match &= !RepeatedLoads.front().isUndef();
7487 Match &= !RepeatedLoads.back().isUndef();
7488 if (!Match)
7489 continue;
7490
7491 EVT RepeatVT =
7492 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7493 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7494 : EVT::getFloatingPointVT(ScalarSize);
7495 if (RepeatSize > ScalarSize)
7496 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7497 RepeatSize / ScalarSize);
7498 EVT BroadcastVT =
7499 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7500 VT.getSizeInBits() / ScalarSize);
7501 if (TLI.isTypeLegal(BroadcastVT)) {
7502 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7503 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7504 SDValue Broadcast = RepeatLoad;
7505 if (RepeatSize > ScalarSize) {
7506 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7507 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7508 } else {
7509 if (!Subtarget.hasAVX2() &&
7510 !X86::mayFoldLoadIntoBroadcastFromMem(
7511 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7512 Subtarget,
7513 /*AssumeSingleUse=*/true))
7514 return SDValue();
7515 Broadcast =
7516 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7517 }
7518 return DAG.getBitcast(VT, Broadcast);
7519 }
7520 }
7521 }
7522 }
7523
7524 return SDValue();
7525}
7526
7527// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
7528// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7529// are consecutive, non-overlapping, and in the right order.
7530static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7531 SelectionDAG &DAG,
7532 const X86Subtarget &Subtarget,
7533 bool IsAfterLegalize) {
7534 SmallVector<SDValue, 64> Elts;
7535 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7536 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7537 Elts.push_back(Elt);
7538 continue;
7539 }
7540 return SDValue();
7541 }
7542 assert(Elts.size() == VT.getVectorNumElements());
7543 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7544 IsAfterLegalize);
7545}
7546
7547static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7548 const APInt &Undefs, LLVMContext &C) {
7549 unsigned ScalarSize = VT.getScalarSizeInBits();
7550 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7551
7552 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7553 if (VT.isFloatingPoint()) {
7554 if (ScalarSize == 16)
7555 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7556 if (ScalarSize == 32)
7557 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7558 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7559 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7560 }
7561 return Constant::getIntegerValue(Ty, Val);
7562 };
7563
7564 SmallVector<Constant *, 32> ConstantVec;
7565 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7566 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7567 : getConstantScalar(Bits[I]));
7568
7569 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7570}
7571
7572static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7573 unsigned SplatBitSize, LLVMContext &C) {
7574 unsigned ScalarSize = VT.getScalarSizeInBits();
7575
7576 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7577 if (VT.isFloatingPoint()) {
7578 if (ScalarSize == 16)
7579 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7580 if (ScalarSize == 32)
7581 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7582 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7583 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7584 }
7585 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7586 };
7587
7588 if (ScalarSize == SplatBitSize)
7589 return getConstantScalar(SplatValue);
7590
7591 unsigned NumElm = SplatBitSize / ScalarSize;
7592 SmallVector<Constant *, 32> ConstantVec;
7593 for (unsigned I = 0; I != NumElm; ++I) {
7594 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7595 ConstantVec.push_back(getConstantScalar(Val));
7596 }
7597 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7598}
7599
7600static bool isFoldableUseOfShuffle(SDNode *N) {
7601 for (auto *U : N->users()) {
7602 unsigned Opc = U->getOpcode();
7603 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7604 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7605 return false;
7606 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7607 return false;
7608 if (isTargetShuffle(Opc))
7609 return true;
7610 if (Opc == ISD::BITCAST) // Ignore bitcasts
7611 return isFoldableUseOfShuffle(U);
7612 if (N->hasOneUse()) {
7613 // TODO: There may be some general way to know if an SDNode can
7614 // be folded. We currently only know whether an MI is foldable.
7615 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7616 return false;
7617 return true;
7618 }
7619 }
7620 return false;
7621}
7622
7623// If the node has a single use by a VSELECT then AVX512 targets may be able to
7624// fold as a predicated instruction.
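// E.g. (illustrative): (vselect K, (add X, Y), Z) can usually be selected as
// a single masked VPADD with K as the write-mask on such targets.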
7625static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7626 unsigned SizeInBits = V.getValueSizeInBits();
7627 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7628 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7629 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7630 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7631 return true;
7632 }
7633 }
7634 return false;
7635}
7636
7637/// Attempt to use the vbroadcast instruction to generate a splat value
7638/// from a splat BUILD_VECTOR which uses:
7639/// a. A single scalar load, or a constant.
7640/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7641///
7642/// The VBROADCAST node is returned when a pattern is found,
7643/// or SDValue() otherwise.
7644static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7645 const SDLoc &dl,
7646 const X86Subtarget &Subtarget,
7647 SelectionDAG &DAG) {
7648 // VBROADCAST requires AVX.
7649 // TODO: Splats could be generated for non-AVX CPUs using SSE
7650 // instructions, but there's less potential gain for only 128-bit vectors.
7651 if (!Subtarget.hasAVX())
7652 return SDValue();
7653
7654 MVT VT = BVOp->getSimpleValueType(0);
7655 unsigned NumElts = VT.getVectorNumElements();
7656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7657 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7658 "Unsupported vector type for broadcast.");
7659
7660 // See if the build vector is a repeating sequence of scalars (inc. splat).
7661 SDValue Ld;
7662 BitVector UndefElements;
7663 SmallVector<SDValue, 16> Sequence;
7664 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7665 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7666 if (Sequence.size() == 1)
7667 Ld = Sequence[0];
7668 }
7669
7670 // Attempt to use VBROADCASTM
7671 // From this pattern:
7672 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7673 // b. t1 = (build_vector t0 t0)
7674 //
7675 // Create (VBROADCASTM v2i1 X)
7676 if (!Sequence.empty() && Subtarget.hasCDI()) {
7677 // If not a splat, are the upper sequence values zeroable?
7678 unsigned SeqLen = Sequence.size();
7679 bool UpperZeroOrUndef =
7680 SeqLen == 1 ||
7681 llvm::all_of(ArrayRef(Sequence).drop_front(),
7682 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7683 SDValue Op0 = Sequence[0];
7684 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7685 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7686 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7687 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7688 ? Op0.getOperand(0)
7689 : Op0.getOperand(0).getOperand(0);
7690 MVT MaskVT = BOperand.getSimpleValueType();
7691 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7692 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7693 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7694 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7695 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7696 unsigned Scale = 512 / VT.getSizeInBits();
7697 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7698 }
7699 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7700 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7701 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7702 return DAG.getBitcast(VT, Bcst);
7703 }
7704 }
7705 }
7706
7707 unsigned NumUndefElts = UndefElements.count();
7708 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7709 APInt SplatValue, Undef;
7710 unsigned SplatBitSize;
7711 bool HasUndef;
7712 // Check if this is a repeated constant pattern suitable for broadcasting.
7713 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7714 SplatBitSize > VT.getScalarSizeInBits() &&
7715 SplatBitSize < VT.getSizeInBits()) {
7716 // Avoid replacing with broadcast when it's a use of a shuffle
7717 // instruction to preserve the present custom lowering of shuffles.
7718 if (isFoldableUseOfShuffle(BVOp))
7719 return SDValue();
7720 // replace BUILD_VECTOR with broadcast of the repeated constants.
7721 LLVMContext *Ctx = DAG.getContext();
7722 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7723 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7724 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7725 // Load the constant scalar/subvector and broadcast it.
7726 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7727 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7728 SDValue CP = DAG.getConstantPool(C, PVT);
7729 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7730
7731 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7732 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7733 SDValue Ops[] = {DAG.getEntryNode(), CP};
7734 MachinePointerInfo MPI =
7735 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7736 SDValue Brdcst =
7737 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7738 MPI, Alignment, MachineMemOperand::MOLoad);
7739 return DAG.getBitcast(VT, Brdcst);
7740 }
7741 if (SplatBitSize > 64) {
7742 // Load the vector of constants and broadcast it.
7743 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7744 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7745 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7746 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7747 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7748 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7749 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7750 MachinePointerInfo MPI =
7751 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7752 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7753 Ops, VVT, MPI, Alignment,
7754 MachineMemOperand::MOLoad);
7755 }
7756 }
7757
7758 // If we are moving a scalar into a vector (Ld must be set and all elements
7759 // but 1 are undef) and that operation is not obviously supported by
7760 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7761 // That's better than general shuffling and may eliminate a load to GPR and
7762 // move from scalar to vector register.
7763 if (!Ld || NumElts - NumUndefElts != 1)
7764 return SDValue();
7765 unsigned ScalarSize = Ld.getValueSizeInBits();
7766 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7767 return SDValue();
7768 }
7769
7770 bool ConstSplatVal =
7771 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7772 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7773
7774 // TODO: Handle broadcasts of non-constant sequences.
7775
7776 // Make sure that all of the users of a non-constant load are from the
7777 // BUILD_VECTOR node.
7778 // FIXME: Is the use count needed for non-constant, non-load case?
7779 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7780 return SDValue();
7781
7782 unsigned ScalarSize = Ld.getValueSizeInBits();
7783 bool IsGE256 = (VT.getSizeInBits() >= 256);
7784
7785 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7786 // instruction to save 8 or more bytes of constant pool data.
7787 // TODO: If multiple splats are generated to load the same constant,
7788 // it may be detrimental to overall size. There needs to be a way to detect
7789 // that condition to know if this is truly a size win.
7790 bool OptForSize = DAG.shouldOptForSize();
7791
7792 // Handle broadcasting a single constant scalar from the constant pool
7793 // into a vector.
7794 // On Sandybridge (no AVX2), it is still better to load a constant vector
7795 // from the constant pool and not to broadcast it from a scalar.
7796 // But override that restriction when optimizing for size.
7797 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7798 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7799 EVT CVT = Ld.getValueType();
7800 assert(!CVT.isVector() && "Must not broadcast a vector type");
7801
7802 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7803 // For size optimization, also splat v2f64 and v2i64, and for size opt
7804 // with AVX2, also splat i8 and i16.
7805 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7806 if (ScalarSize == 32 ||
7807 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7808 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7809 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7810 const Constant *C = nullptr;
7811 if (auto *CI = dyn_cast<ConstantSDNode>(Ld))
7812 C = CI->getConstantIntValue();
7813 else if (auto *CF = dyn_cast<ConstantFPSDNode>(Ld))
7814 C = CF->getConstantFPValue();
7815
7816 assert(C && "Invalid constant type");
7817
7818 SDValue CP =
7819 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7820 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7821
7822 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7823 SDValue Ops[] = {DAG.getEntryNode(), CP};
7824 MachinePointerInfo MPI =
7825 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7826 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7827 MPI, Alignment, MachineMemOperand::MOLoad);
7828 }
7829 }
7830
7831 // Handle AVX2 in-register broadcasts.
7832 if (!IsLoad && Subtarget.hasInt256() &&
7833 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7834 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7835
7836 // The scalar source must be a normal load.
7837 if (!IsLoad)
7838 return SDValue();
7839
7840 // Make sure the non-chain result is only used by this build vector.
7841 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7842 return SDValue();
7843
7844 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7845 (Subtarget.hasVLX() && ScalarSize == 64)) {
7846 auto *LN = cast<LoadSDNode>(Ld);
7847 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7848 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7849 SDValue BCast =
7850 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7851 LN->getMemoryVT(), LN->getMemOperand());
7852 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7853 return BCast;
7854 }
7855
7856 // The integer check is needed for the 64-bit element broadcast into a
7857 // 128-bit vector, so it doesn't match double, since there is no vbroadcastsd xmm.
7858 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7859 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7860 auto *LN = cast<LoadSDNode>(Ld);
7861 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7862 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7863 SDValue BCast =
7864 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7865 LN->getMemoryVT(), LN->getMemOperand());
7866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7867 return BCast;
7868 }
7869
7870 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7871 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7872
7873 // Unsupported broadcast.
7874 return SDValue();
7875}
7876
7877/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7878/// underlying vector and index.
7879///
7880/// Modifies \p ExtractedFromVec to the real vector and returns the real
7881/// index.
7882static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7883 SDValue ExtIdx) {
7884 int Idx = ExtIdx->getAsZExtVal();
7885 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7886 return Idx;
7887
7888 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7889 // lowered this:
7890 // (extract_vector_elt (v8f32 %1), Constant<6>)
7891 // to:
7892 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7893 // (extract_subvector (v8f32 %0), Constant<4>),
7894 // undef)
7895 // Constant<0>)
7896 // In this case the vector is the extract_subvector expression and the index
7897 // is 2, as specified by the shuffle.
7898 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7899 SDValue ShuffleVec = SVOp->getOperand(0);
7900 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7901 assert(ShuffleVecVT.getVectorElementType() ==
7902 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7903
7904 int ShuffleIdx = SVOp->getMaskElt(Idx);
7905 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7906 ExtractedFromVec = ShuffleVec;
7907 return ShuffleIdx;
7908 }
7909 return Idx;
7910}
7911
7912static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7913 SelectionDAG &DAG) {
7914 MVT VT = Op.getSimpleValueType();
7915
7916 // Skip if insert_vec_elt is not supported.
7917 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7918 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7919 return SDValue();
7920
7921 unsigned NumElems = Op.getNumOperands();
7922 SDValue VecIn1;
7923 SDValue VecIn2;
7924 SmallVector<unsigned, 4> InsertIndices;
7925 SmallVector<int, 8> Mask(NumElems, -1);
7926
7927 for (unsigned i = 0; i != NumElems; ++i) {
7928 unsigned Opc = Op.getOperand(i).getOpcode();
7929
7930 if (Opc == ISD::UNDEF)
7931 continue;
7932
7933 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7934 // Quit if more than 1 element needs inserting.
7935 if (InsertIndices.size() > 1)
7936 return SDValue();
7937
7938 InsertIndices.push_back(i);
7939 continue;
7940 }
7941
7942 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7943 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7944
7945 // Quit if non-constant index.
7946 if (!isa<ConstantSDNode>(ExtIdx))
7947 return SDValue();
7948 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7949
7950 // Quit if extracted from vector of different type.
7951 if (ExtractedFromVec.getValueType() != VT)
7952 return SDValue();
7953
7954 if (!VecIn1.getNode())
7955 VecIn1 = ExtractedFromVec;
7956 else if (VecIn1 != ExtractedFromVec) {
7957 if (!VecIn2.getNode())
7958 VecIn2 = ExtractedFromVec;
7959 else if (VecIn2 != ExtractedFromVec)
7960 // Quit if more than 2 vectors to shuffle
7961 return SDValue();
7962 }
7963
7964 if (ExtractedFromVec == VecIn1)
7965 Mask[i] = Idx;
7966 else if (ExtractedFromVec == VecIn2)
7967 Mask[i] = Idx + NumElems;
7968 }
7969
7970 if (!VecIn1.getNode())
7971 return SDValue();
7972
7973 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7974 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7975
7976 for (unsigned Idx : InsertIndices)
7977 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7978 DAG.getVectorIdxConstant(Idx, DL));
7979
7980 return NV;
7981}
7982
7983// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7984static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7985 const X86Subtarget &Subtarget) {
7986 MVT VT = Op.getSimpleValueType();
7987 MVT IVT =
7988 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7989 SmallVector<SDValue, 32> NewOps;
7990 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7991 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7992 Op.getOperand(I)));
7993 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7994 return DAG.getBitcast(VT, Res);
7995}
7996
7997// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7998static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7999 SelectionDAG &DAG,
8000 const X86Subtarget &Subtarget) {
8001
8002 MVT VT = Op.getSimpleValueType();
8003 assert((VT.getVectorElementType() == MVT::i1) &&
8004 "Unexpected type in LowerBUILD_VECTORvXi1!");
8005 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8006 ISD::isBuildVectorAllOnes(Op.getNode()))
8007 return Op;
8008
8009 uint64_t Immediate = 0;
8010 SmallVector<unsigned, 16> NonConstIdx;
8011 bool IsSplat = true;
8012 bool HasConstElts = false;
8013 int SplatIdx = -1;
8014 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8015 SDValue In = Op.getOperand(idx);
8016 if (In.isUndef())
8017 continue;
8018 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8019 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8020 HasConstElts = true;
8021 } else {
8022 NonConstIdx.push_back(idx);
8023 }
8024 if (SplatIdx < 0)
8025 SplatIdx = idx;
8026 else if (In != Op.getOperand(SplatIdx))
8027 IsSplat = false;
8028 }
8029
8030 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
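 // E.g. (illustrative): splatting an i1 condition c into v16i1 becomes
 //   (bitcast (select c, i16 -1, i16 0) to v16i1)
 // so the whole mask is materialized with a single scalar cmov.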
8031 if (IsSplat) {
8032 // The build_vector allows the scalar element to be larger than the vector
8033 // element type. We need to mask it to use as a condition unless we know
8034 // the upper bits are zero.
8035 // FIXME: Use computeKnownBits instead of checking specific opcode?
8036 SDValue Cond = Op.getOperand(SplatIdx);
8037 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8038 if (Cond.getOpcode() != ISD::SETCC)
8039 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8040 DAG.getConstant(1, dl, MVT::i8));
8041
8042 // Perform the select in the scalar domain so we can use cmov.
8043 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8044 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8045 DAG.getAllOnesConstant(dl, MVT::i32),
8046 DAG.getConstant(0, dl, MVT::i32));
8047 Select = DAG.getBitcast(MVT::v32i1, Select);
8048 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8049 } else {
8050 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8051 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8052 DAG.getAllOnesConstant(dl, ImmVT),
8053 DAG.getConstant(0, dl, ImmVT));
8054 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8055 Select = DAG.getBitcast(VecVT, Select);
8056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8057 DAG.getVectorIdxConstant(0, dl));
8058 }
8059 }
8060
8061 // insert elements one by one
8062 SDValue DstVec;
8063 if (HasConstElts) {
8064 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8065 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8066 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8067 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8068 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8069 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8070 } else {
8071 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8072 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8073 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8074 DstVec = DAG.getBitcast(VecVT, Imm);
8075 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8076 DAG.getVectorIdxConstant(0, dl));
8077 }
8078 } else
8079 DstVec = DAG.getUNDEF(VT);
8080
8081 for (unsigned InsertIdx : NonConstIdx) {
8082 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8083 Op.getOperand(InsertIdx),
8084 DAG.getVectorIdxConstant(InsertIdx, dl));
8085 }
8086 return DstVec;
8087}
8088
8089LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
8090 switch (Opcode) {
8091 case X86ISD::PACKSS:
8092 case X86ISD::PACKUS:
8093 case X86ISD::FHADD:
8094 case X86ISD::FHSUB:
8095 case X86ISD::HADD:
8096 case X86ISD::HSUB:
8097 return true;
8098 }
8099 return false;
8100}
8101
8102/// This is a helper function of LowerToHorizontalOp().
8103/// This function checks whether the input build_vector \p N implements a
8104/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8105/// may not match the layout of an x86 256-bit horizontal instruction.
8106/// In other words, if this returns true, then some extraction/insertion will
8107/// be required to produce a valid horizontal instruction.
8108///
8109/// Parameter \p Opcode defines the kind of horizontal operation to match.
8110/// For example, if \p Opcode is equal to ISD::ADD, then this function
8111/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8112/// is equal to ISD::SUB, then this function checks if this is a horizontal
8113/// arithmetic sub.
8114///
8115/// This function only analyzes elements of \p N whose indices are
8116/// in range [BaseIdx, LastIdx).
8117///
8118/// TODO: This function was originally used to match both real and fake partial
8119/// horizontal operations, but the index-matching logic is incorrect for that.
8120/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8121/// code because it is only used for partial h-op matching now?
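/// A sketch of the matched pattern (illustrative, not from the original
/// comment):
///   (build_vector (add (extractelt A, 0), (extractelt A, 1)),
///                 (add (extractelt A, 2), (extractelt A, 3)), ...)
/// i.e. each output element combines a pair of adjacent elements of the same
/// source vector.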
8122static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8123 const SDLoc &DL, SelectionDAG &DAG,
8124 unsigned BaseIdx, unsigned LastIdx,
8125 SDValue &V0, SDValue &V1) {
8126 EVT VT = N->getValueType(0);
8127 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8128 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8129 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8130 "Invalid Vector in input!");
8131
8132 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8133 bool CanFold = true;
8134 unsigned ExpectedVExtractIdx = BaseIdx;
8135 unsigned NumElts = LastIdx - BaseIdx;
8136 V0 = DAG.getUNDEF(VT);
8137 V1 = DAG.getUNDEF(VT);
8138
8139 // Check if N implements a horizontal binop.
8140 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8141 SDValue Op = N->getOperand(i + BaseIdx);
8142
8143 // Skip UNDEFs.
8144 if (Op->isUndef()) {
8145 // Update the expected vector extract index.
8146 if (i * 2 == NumElts)
8147 ExpectedVExtractIdx = BaseIdx;
8148 ExpectedVExtractIdx += 2;
8149 continue;
8150 }
8151
8152 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8153
8154 if (!CanFold)
8155 break;
8156
8157 SDValue Op0 = Op.getOperand(0);
8158 SDValue Op1 = Op.getOperand(1);
8159
8160 // Try to match the following pattern:
8161 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8162 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8163 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8164 Op0.getOperand(0) == Op1.getOperand(0) &&
8165 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8166 isa<ConstantSDNode>(Op1.getOperand(1)));
8167 if (!CanFold)
8168 break;
8169
8170 unsigned I0 = Op0.getConstantOperandVal(1);
8171 unsigned I1 = Op1.getConstantOperandVal(1);
8172
8173 if (i * 2 < NumElts) {
8174 if (V0.isUndef()) {
8175 V0 = Op0.getOperand(0);
8176 if (V0.getValueType() != VT)
8177 return false;
8178 }
8179 } else {
8180 if (V1.isUndef()) {
8181 V1 = Op0.getOperand(0);
8182 if (V1.getValueType() != VT)
8183 return false;
8184 }
8185 if (i * 2 == NumElts)
8186 ExpectedVExtractIdx = BaseIdx;
8187 }
8188
8189 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8190 if (I0 == ExpectedVExtractIdx)
8191 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8192 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8193 // Try to match the following dag sequence:
8194 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8195 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8196 } else
8197 CanFold = false;
8198
8199 ExpectedVExtractIdx += 2;
8200 }
8201
8202 return CanFold;
8203}
8204
8205/// Emit a sequence of two 128-bit horizontal add/sub followed by
8206/// a concat_vector.
8207///
8208/// This is a helper function of LowerToHorizontalOp().
8209/// This function expects two 256-bit vectors called V0 and V1.
8210/// At first, each vector is split into two separate 128-bit vectors.
8211/// Then, the resulting 128-bit vectors are used to implement two
8212/// horizontal binary operations.
8213///
8214/// The kind of horizontal binary operation is defined by \p X86Opcode.
8215///
8216/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8217/// the two new horizontal binop.
8218/// When Mode is set, the first horizontal binop dag node would take as input
8219/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8220/// horizontal binop dag node would take as input the lower 128-bit of V1
8221/// and the upper 128-bit of V1.
8222/// Example:
8223/// HADD V0_LO, V0_HI
8224/// HADD V1_LO, V1_HI
8225///
8226/// Otherwise, the first horizontal binop dag node takes as input the lower
8227/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8228/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8229/// Example:
8230/// HADD V0_LO, V1_LO
8231/// HADD V0_HI, V1_HI
8232///
8233/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8234/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8235/// the upper 128-bits of the result.
8236static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8237 const SDLoc &DL, SelectionDAG &DAG,
8238 unsigned X86Opcode, bool Mode,
8239 bool isUndefLO, bool isUndefHI) {
8240 MVT VT = V0.getSimpleValueType();
8241 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8242 "Invalid nodes in input!");
8243
8244 unsigned NumElts = VT.getVectorNumElements();
8245 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8246 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8247 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8248 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8249 MVT NewVT = V0_LO.getSimpleValueType();
8250
8251 SDValue LO = DAG.getUNDEF(NewVT);
8252 SDValue HI = DAG.getUNDEF(NewVT);
8253
8254 if (Mode) {
8255 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8256 if (!isUndefLO && !V0->isUndef())
8257 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8258 if (!isUndefHI && !V1->isUndef())
8259 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8260 } else {
8261 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8262 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8263 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8264
8265 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8266 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8267 }
8268
8269 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8270}
8271
8272/// Returns true iff \p BV builds a vector with the result equivalent to
8273/// the result of ADDSUB/SUBADD operation.
8274/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8275/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8276/// \p Opnd0 and \p Opnd1.
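///
/// E.g. (illustrative) for a v4f32 ADDSUB:
///   (build_vector (fsub (extractelt A, 0), (extractelt B, 0)),
///                 (fadd (extractelt A, 1), (extractelt B, 1)),
///                 (fsub (extractelt A, 2), (extractelt B, 2)),
///                 (fadd (extractelt A, 3), (extractelt B, 3)))
/// is matched with Opnd0 = A and Opnd1 = B.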
8277static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8278 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8279 SDValue &Opnd0, SDValue &Opnd1,
8280 unsigned &NumExtracts, bool &IsSubAdd,
8281 bool &HasAllowContract) {
8282 using namespace SDPatternMatch;
8283
8284 MVT VT = BV->getSimpleValueType(0);
8285 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8286 return false;
8287
8288 unsigned NumElts = VT.getVectorNumElements();
8289 SDValue InVec0 = DAG.getUNDEF(VT);
8290 SDValue InVec1 = DAG.getUNDEF(VT);
8291
8292 NumExtracts = 0;
8293 HasAllowContract = NumElts != 0;
8294
8295 // Odd-numbered elements in the input build vector are obtained from
8296 // adding/subtracting two integer/float elements.
8297 // Even-numbered elements in the input build vector are obtained from
8298 // subtracting/adding two integer/float elements.
8299 unsigned Opc[2] = {0, 0};
8300 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8301 SDValue Op = BV->getOperand(i);
8302
8303 // Skip 'undef' values.
8304 unsigned Opcode = Op.getOpcode();
8305 if (Opcode == ISD::UNDEF)
8306 continue;
8307
8308 // Early exit if we found an unexpected opcode.
8309 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8310 return false;
8311
8312 SDValue Op0 = Op.getOperand(0);
8313 SDValue Op1 = Op.getOperand(1);
8314
8315 // Try to match the following pattern:
8316 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8317 // Early exit if we cannot match that sequence.
8318 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8319 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8320 return false;
8321
8322 // We found a valid add/sub node; make sure it's the same opcode as the
8323 // previous elements for this parity.
8324 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8325 return false;
8326 Opc[i % 2] = Opcode;
8327
8328 // Update InVec0 and InVec1.
8329 if (InVec0.isUndef())
8330 InVec0 = Op0.getOperand(0);
8331 if (InVec1.isUndef())
8332 InVec1 = Op1.getOperand(0);
8333
8334 // Make sure that the operands of each add/sub node always
8335 // come from the same pair of vectors.
8336 if (InVec0 != Op0.getOperand(0)) {
8337 if (Opcode == ISD::FSUB)
8338 return false;
8339
8340 // FADD is commutable. Try to commute the operands
8341 // and then test again.
8342 std::swap(Op0, Op1);
8343 if (InVec0 != Op0.getOperand(0))
8344 return false;
8345 }
8346
8347 if (InVec1 != Op1.getOperand(0))
8348 return false;
8349
8350 // Increment the number of extractions done.
8351 ++NumExtracts;
8352 HasAllowContract &= Op->getFlags().hasAllowContract();
8353 }
8354
8355 // Ensure we have found an opcode for both parities and that they are
8356 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8357 // inputs are undef.
8358 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8359 InVec0.isUndef() || InVec1.isUndef())
8360 return false;
8361
8362 IsSubAdd = Opc[0] == ISD::FADD;
8363
8364 Opnd0 = InVec0;
8365 Opnd1 = InVec1;
8366 return true;
8367}
8368
8369/// Returns true if it is possible to fold MUL and an idiom that has already been
8370/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8371/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8372/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8373///
8374/// Prior to calling this function it should be known that there is some
8375/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8376/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8377/// before replacement of such SDNode with ADDSUB operation. Thus the number
8378/// of \p Opnd0 uses is expected to be equal to 2.
8379/// For example, this function may be called for the following IR:
8380/// %AB = fmul fast <2 x double> %A, %B
8381/// %Sub = fsub fast <2 x double> %AB, %C
8382/// %Add = fadd fast <2 x double> %AB, %C
8383/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8384/// <2 x i32> <i32 0, i32 3>
8385/// There is a def for %Addsub here, which potentially can be replaced by
8386/// X86ISD::ADDSUB operation:
8387/// %Addsub = X86ISD::ADDSUB %AB, %C
8388/// and such ADDSUB can further be replaced with FMADDSUB:
8389/// %Addsub = FMADDSUB %A, %B, %C.
8390///
8391/// The main reason why this method is called before the replacement of the
8392/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8393/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8394/// FMADDSUB is.
8395static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8396 SelectionDAG &DAG, SDValue &Opnd0,
8397 SDValue &Opnd1, SDValue &Opnd2,
8398 unsigned ExpectedUses,
8399 bool AllowSubAddOrAddSubContract) {
8400 if (Opnd0.getOpcode() != ISD::FMUL ||
8401 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8402 return false;
8403
8404 // FIXME: These checks must match the similar ones in
8405 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8406 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8407 // or MUL + ADDSUB to FMADDSUB.
8408 const TargetOptions &Options = DAG.getTarget().Options;
8409 bool AllowFusion =
8410 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8411 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8412 if (!AllowFusion)
8413 return false;
8414
8415 Opnd2 = Opnd1;
8416 Opnd1 = Opnd0.getOperand(1);
8417 Opnd0 = Opnd0.getOperand(0);
8418
8419 return true;
8420}
8421
8422/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8423/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8424/// X86ISD::FMSUBADD node.
8425static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8426 const SDLoc &DL,
8427 const X86Subtarget &Subtarget,
8428 SelectionDAG &DAG) {
8429 SDValue Opnd0, Opnd1;
8430 unsigned NumExtracts;
8431 bool IsSubAdd;
8432 bool HasAllowContract;
8433 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8434 HasAllowContract))
8435 return SDValue();
8436
8437 MVT VT = BV->getSimpleValueType(0);
8438
8439 // Try to generate X86ISD::FMADDSUB node here.
8440 SDValue Opnd2;
8441 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8442 HasAllowContract)) {
8443 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8444 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8445 }
8446
8447 // We only support ADDSUB.
8448 if (IsSubAdd)
8449 return SDValue();
8450
8451 // There are no known X86 targets with 512-bit ADDSUB instructions!
8452 // Convert to blend(fsub,fadd).
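 // For example, for v16f32 (E = 16) the loop below builds the mask
 // <0,17, 2,19, 4,21, 6,23, 8,25, 10,27, 12,29, 14,31>, taking even elements
 // from the FSUB result and odd elements from the FADD result.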
8453 if (VT.is512BitVector()) {
8454 SmallVector<int> Mask;
8455 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8456 Mask.push_back(I);
8457 Mask.push_back(I + E + 1);
8458 }
8459 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8460 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8461 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8462 }
8463
8464 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8465}
8466
8467static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8468 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8469 // Initialize outputs to known values.
8470 MVT VT = BV->getSimpleValueType(0);
8471 HOpcode = ISD::DELETED_NODE;
8472 V0 = DAG.getUNDEF(VT);
8473 V1 = DAG.getUNDEF(VT);
8474
8475 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8476 // half of the result is calculated independently from the 128-bit halves of
8477 // the inputs, so that makes the index-checking logic below more complicated.
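 // For example, a v8f32 horizontal add of sources A and B produces:
 //   chunk 0: { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }
 //   chunk 1: { A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
 // i.e. within each 128-bit chunk the first half of the result elements reads
 // from V0 and the second half from V1, with extract indices drawn from the
 // matching chunk of that source.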
8478 unsigned NumElts = VT.getVectorNumElements();
8479 unsigned GenericOpcode = ISD::DELETED_NODE;
8480 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8481 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8482 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8483 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8484 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8485 // Ignore undef elements.
8486 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8487 if (Op.isUndef())
8488 continue;
8489
8490 // If there's an opcode mismatch, we're done.
8491 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8492 return false;
8493
8494 // Initialize horizontal opcode.
8495 if (HOpcode == ISD::DELETED_NODE) {
8496 GenericOpcode = Op.getOpcode();
8497 switch (GenericOpcode) {
8498 // clang-format off
8499 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8500 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8501 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8502 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8503 default: return false;
8504 // clang-format on
8505 }
8506 }
8507
8508 SDValue Op0 = Op.getOperand(0);
8509 SDValue Op1 = Op.getOperand(1);
8510 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8511 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8512 Op0.getOperand(0) != Op1.getOperand(0) ||
8513 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8514 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8515 return false;
8516
8517 // The source vector is chosen based on which 64-bit half of the
8518 // destination vector is being calculated.
8519 if (j < NumEltsIn64Bits) {
8520 if (V0.isUndef())
8521 V0 = Op0.getOperand(0);
8522 } else {
8523 if (V1.isUndef())
8524 V1 = Op0.getOperand(0);
8525 }
8526
8527 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8528 if (SourceVec != Op0.getOperand(0))
8529 return false;
8530
8531 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8532 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8533 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8534 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8535 (j % NumEltsIn64Bits) * 2;
8536 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8537 continue;
8538
8539 // If this is not a commutative op, this does not match.
8540 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8541 return false;
8542
8543 // Addition is commutative, so try swapping the extract indexes.
8544 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8545 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8546 continue;
8547
8548 // Extract indexes do not match horizontal requirement.
8549 return false;
8550 }
8551 }
8552 // We matched. Opcode and operands are returned by reference as arguments.
8553 return true;
8554}
8555
8556static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8557 const SDLoc &DL, SelectionDAG &DAG,
8558 unsigned HOpcode, SDValue V0, SDValue V1) {
8559 // If either input vector is not the same size as the build vector,
8560 // extract/insert the low bits to the correct size.
8561 // This is free (examples: zmm --> xmm, xmm --> ymm).
8562 MVT VT = BV->getSimpleValueType(0);
8563 unsigned Width = VT.getSizeInBits();
8564 if (V0.getValueSizeInBits() > Width)
8565 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8566 else if (V0.getValueSizeInBits() < Width)
8567 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8568
8569 if (V1.getValueSizeInBits() > Width)
8570 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8571 else if (V1.getValueSizeInBits() < Width)
8572 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8573
8574 unsigned NumElts = VT.getVectorNumElements();
8575 APInt DemandedElts = APInt::getAllOnes(NumElts);
8576 for (unsigned i = 0; i != NumElts; ++i)
8577 if (BV->getOperand(i).isUndef())
8578 DemandedElts.clearBit(i);
8579
8580 // If we don't need the upper xmm, then perform as a xmm hop.
8581 unsigned HalfNumElts = NumElts / 2;
8582 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8583 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8584 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8585 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8586 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8587 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8588 }
8589
8590 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8591}
8592
8593/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8594static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8595 const X86Subtarget &Subtarget,
8596 SelectionDAG &DAG) {
8597 // We need at least 2 non-undef elements to make this worthwhile by default.
8598 unsigned NumNonUndefs =
8599 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8600 if (NumNonUndefs < 2)
8601 return SDValue();
8602
8603 // There are 4 sets of horizontal math operations distinguished by type:
8604 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8605 // subtarget feature. Try to match those "native" patterns first.
8606 MVT VT = BV->getSimpleValueType(0);
8607 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8608 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8609 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8610 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8611 unsigned HOpcode;
8612 SDValue V0, V1;
8613 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8614 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8615 }
8616
8617 // Try harder to match 256-bit ops by using extract/concat.
8618 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8619 return SDValue();
8620
8621 // Count the number of UNDEF operands in the build_vector in input.
8622 unsigned NumElts = VT.getVectorNumElements();
8623 unsigned Half = NumElts / 2;
8624 unsigned NumUndefsLO = 0;
8625 unsigned NumUndefsHI = 0;
8626 for (unsigned i = 0, e = Half; i != e; ++i)
8627 if (BV->getOperand(i)->isUndef())
8628 NumUndefsLO++;
8629
8630 for (unsigned i = Half, e = NumElts; i != e; ++i)
8631 if (BV->getOperand(i)->isUndef())
8632 NumUndefsHI++;
8633
8634 SDValue InVec0, InVec1;
8635 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8636 SDValue InVec2, InVec3;
8637 unsigned X86Opcode;
8638 bool CanFold = true;
8639
8640 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8641 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8642 InVec3) &&
8643 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8644 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8645 X86Opcode = X86ISD::HADD;
8646 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8647 InVec1) &&
8648 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8649 InVec3) &&
8650 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8651 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8652 X86Opcode = X86ISD::HSUB;
8653 else
8654 CanFold = false;
8655
8656 if (CanFold) {
8657 // Do not try to expand this build_vector into a pair of horizontal
8658 // add/sub if we can emit a pair of scalar add/sub.
8659 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8660 return SDValue();
8661
8662 // Convert this build_vector into a pair of horizontal binops followed by
8663 // a concat vector. We must adjust the outputs from the partial horizontal
8664 // matching calls above to account for undefined vector halves.
8665 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8666 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8667 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8668 bool isUndefLO = NumUndefsLO == Half;
8669 bool isUndefHI = NumUndefsHI == Half;
8670 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8671 isUndefHI);
8672 }
8673 }
8674
8675 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8676 VT == MVT::v16i16) {
8677 unsigned X86Opcode;
8678 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8679 InVec1))
8680 X86Opcode = X86ISD::HADD;
8681 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8682 InVec1))
8683 X86Opcode = X86ISD::HSUB;
8684 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8685 InVec1))
8686 X86Opcode = X86ISD::FHADD;
8687 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8688 InVec1))
8689 X86Opcode = X86ISD::FHSUB;
8690 else
8691 return SDValue();
8692
8693 // Don't try to expand this build_vector into a pair of horizontal add/sub
8694 // if we can simply emit a pair of scalar add/sub.
8695 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8696 return SDValue();
8697
8698 // Convert this build_vector into two horizontal add/sub followed by
8699 // a concat vector.
8700 bool isUndefLO = NumUndefsLO == Half;
8701 bool isUndefHI = NumUndefsHI == Half;
8702 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8703 isUndefLO, isUndefHI);
8704 }
8705
8706 return SDValue();
8707}
8708
8709static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8710 SelectionDAG &DAG);
8711
8712/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8713/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8714/// just apply the bit to the vectors.
8715/// NOTE: It's not in our interest to start building a general-purpose vectorizer
8716/// from this, but enough scalar bit operations are created from the later
8717/// legalization + scalarization stages to need basic support.
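/// For example, (build_vector (shl x, 3), (shl y, 3), (shl z, 3), (shl w, 3))
/// becomes (shl (build_vector x, y, z, w), (build_vector 3, 3, 3, 3)), with
/// the shift lowered immediately so the constant vector isn't turned into a
/// constant-pool load first.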
8718static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8719 const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG) {
8721 MVT VT = Op->getSimpleValueType(0);
8722 unsigned NumElems = VT.getVectorNumElements();
8723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8724
8725 // Check that all elements have the same opcode.
8726 // TODO: Should we allow UNDEFS and if so how many?
8727 unsigned Opcode = Op->getOperand(0).getOpcode();
8728 for (unsigned i = 1; i < NumElems; ++i)
8729 if (Opcode != Op->getOperand(i).getOpcode())
8730 return SDValue();
8731
8732 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8733 bool IsShift = false;
8734 switch (Opcode) {
8735 default:
8736 return SDValue();
8737 case ISD::SHL:
8738 case ISD::SRL:
8739 case ISD::SRA:
8740 IsShift = true;
8741 break;
8742 case ISD::AND:
8743 case ISD::XOR:
8744 case ISD::OR:
8745 // Don't do this if the buildvector is a splat - we'd replace one
8746 // constant with an entire vector.
8747 if (Op->getSplatValue())
8748 return SDValue();
8749 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8750 return SDValue();
8751 break;
8752 }
8753
8754 SmallVector<SDValue, 4> LHSElts, RHSElts;
8755 for (SDValue Elt : Op->ops()) {
8756 SDValue LHS = Elt.getOperand(0);
8757 SDValue RHS = Elt.getOperand(1);
8758
8759 // We expect the canonicalized RHS operand to be the constant.
8760 if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8761 return SDValue();
8762
8763 // Extend shift amounts.
8764 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8765 if (!IsShift)
8766 return SDValue();
8767 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8768 }
8769
8770 LHSElts.push_back(LHS);
8771 RHSElts.push_back(RHS);
8772 }
8773
8774 // Limit to shifts by uniform immediates.
8775 // TODO: Only accept vXi8/vXi64 special cases?
8776 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8777 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8778 return SDValue();
8779
8780 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8781 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8782 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8783
8784 if (!IsShift)
8785 return Res;
8786
8787 // Immediately lower the shift to ensure the constant build vector doesn't
8788 // get converted to a constant pool before the shift is lowered.
8789 return LowerShift(Res, Subtarget, DAG);
8790}
8791
8792static bool isShuffleFoldableLoad(SDValue);
8793
8794/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8795/// representing a blend.
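/// For example, with AVX2 a v4f64 build_vector <a, b, b, a> is lowered as a
/// shuffle of splat(a) and splat(b) with mask <0, 5, 6, 3>.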
8796static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8797 X86Subtarget const &Subtarget,
8798 SelectionDAG &DAG) {
8799 MVT VT = BVOp->getSimpleValueType(0u);
8800
8801 if (VT != MVT::v4f64)
8802 return SDValue();
8803
8804 // Collect unique operands.
8805 auto UniqueOps = SmallSet<SDValue, 16u>();
8806 for (SDValue Op : BVOp->ops()) {
8807 if (isIntOrFPConstant(Op) || Op.isUndef())
8808 return SDValue();
8809 UniqueOps.insert(Op);
8810 }
8811
8812 // Candidate BUILD_VECTOR must have 2 unique operands.
8813 if (UniqueOps.size() != 2u)
8814 return SDValue();
8815
8816 SDValue Op0 = BVOp->getOperand(0u);
8817 UniqueOps.erase(Op0);
8818 SDValue Op1 = *UniqueOps.begin();
8819
8820 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8821 isShuffleFoldableLoad(Op1)) {
8822 // Create shuffle mask.
8823 auto const NumElems = VT.getVectorNumElements();
8824 SmallVector<int, 16u> Mask(NumElems);
8825 for (auto I = 0u; I < NumElems; ++I) {
8826 SDValue Op = BVOp->getOperand(I);
8827 Mask[I] = Op == Op0 ? I : I + NumElems;
8828 }
8829 // Create shuffle of splats.
8830 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8831 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8832 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8833 }
8834
8835 return SDValue();
8836}
8837
8838/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8839/// functionality to do this, so it's all zeros, all ones, or some derivation
8840/// that is cheap to calculate.
8841static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8842 SelectionDAG &DAG,
8843 const X86Subtarget &Subtarget) {
8844 MVT VT = Op.getSimpleValueType();
8845
8846 // Vectors containing all zeros can be matched by pxor and xorps.
8847 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8848 return Op;
8849
8850 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8851 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8852 // vpcmpeqd on 256-bit vectors.
8853 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8854 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8855 return Op;
8856
8857 return getOnesVector(VT, DAG, DL);
8858 }
8859
8860 return SDValue();
8861}
8862
8863/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8864/// from a vector of source values and a vector of extraction indices.
8865/// The vectors might be manipulated to match the type of the permute op.
8866static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8867 const SDLoc &DL, SelectionDAG &DAG,
8868 const X86Subtarget &Subtarget) {
8869 MVT ShuffleVT = VT;
8870 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8871 unsigned NumElts = VT.getVectorNumElements();
8872 unsigned SizeInBits = VT.getSizeInBits();
8873
8874 // Adjust IndicesVec to match VT size.
8875 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8876 "Illegal variable permute mask size");
8877 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8878 // Narrow/widen the indices vector to the correct size.
8879 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8880 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8881 NumElts * VT.getScalarSizeInBits());
8882 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8883 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8884 SDLoc(IndicesVec), SizeInBits);
8885 // Zero-extend the index elements within the vector.
8886 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8887 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8888 IndicesVT, IndicesVec);
8889 }
8890 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8891
8892 // Handle a SrcVec whose size doesn't match the VT size.
8893 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8894 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8895 // Handle larger SrcVec by treating it as a larger permute.
8896 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8897 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8898 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8899 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8900 Subtarget, DAG, SDLoc(IndicesVec));
8901 SDValue NewSrcVec =
8902 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8903 if (NewSrcVec)
8904 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8905 return SDValue();
8906 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8907 // Widen smaller SrcVec to match VT.
8908 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8909 } else
8910 return SDValue();
8911 }
8912
8913 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8914 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8915 EVT SrcVT = Idx.getValueType();
8916 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8917 uint64_t IndexScale = 0;
8918 uint64_t IndexOffset = 0;
8919
8920 // If we're scaling a smaller permute op, then we need to repeat the
8921 // indices, scaling and offsetting them as well.
8922 // e.g. v4i32 -> v16i8 (Scale = 4)
8923 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8924 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8925 for (uint64_t i = 0; i != Scale; ++i) {
8926 IndexScale |= Scale << (i * NumDstBits);
8927 IndexOffset |= i << (i * NumDstBits);
8928 }
8929
8930 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8931 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8932 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8933 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8934 return Idx;
8935 };
8936
8937 unsigned Opcode = 0;
8938 switch (VT.SimpleTy) {
8939 default:
8940 break;
8941 case MVT::v16i8:
8942 if (Subtarget.hasSSSE3())
8943 Opcode = X86ISD::PSHUFB;
8944 break;
8945 case MVT::v8i16:
8946 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8947 Opcode = X86ISD::VPERMV;
8948 else if (Subtarget.hasSSSE3()) {
8949 Opcode = X86ISD::PSHUFB;
8950 ShuffleVT = MVT::v16i8;
8951 }
8952 break;
8953 case MVT::v4f32:
8954 case MVT::v4i32:
8955 if (Subtarget.hasAVX()) {
8956 Opcode = X86ISD::VPERMILPV;
8957 ShuffleVT = MVT::v4f32;
8958 } else if (Subtarget.hasSSSE3()) {
8959 Opcode = X86ISD::PSHUFB;
8960 ShuffleVT = MVT::v16i8;
8961 }
8962 break;
8963 case MVT::v2f64:
8964 case MVT::v2i64:
8965 if (Subtarget.hasAVX()) {
8966 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
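 // e.g. an element index of 1 must become 2 so that bit#1 is set; adding the
 // vector to itself doubles every index without needing a multiply.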
8967 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8968 Opcode = X86ISD::VPERMILPV;
8969 ShuffleVT = MVT::v2f64;
8970 } else if (Subtarget.hasSSE41()) {
8971 // SSE41 can compare v2i64 - select between indices 0 and 1.
8972 return DAG.getSelectCC(
8973 DL, IndicesVec,
8974 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8975 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8976 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8977 ISD::CondCode::SETEQ);
8978 }
8979 break;
8980 case MVT::v32i8:
8981 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8982 Opcode = X86ISD::VPERMV;
8983 else if (Subtarget.hasXOP()) {
8984 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8985 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8986 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8987 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8988 return DAG.getNode(
8989 ISD::CONCAT_VECTORS, DL, VT,
8990 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8991 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8992 } else if (Subtarget.hasAVX()) {
8993 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8994 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8995 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8996 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8997 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8998 ArrayRef<SDValue> Ops) {
8999 // Permute Lo and Hi and then select based on index range.
9000 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9001 // care about bit[7] as it's just an index vector.
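 // For example, an index byte of 20 compares greater than 15, so the select
 // picks the PSHUFB of the high half, where bits[3:0] = 4 fetch source byte 20.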
9002 SDValue Idx = Ops[2];
9003 EVT VT = Idx.getValueType();
9004 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9005 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9006 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9007 ISD::CondCode::SETGT);
9008 };
9009 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9010 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9011 PSHUFBBuilder);
9012 }
9013 break;
9014 case MVT::v16i16:
9015 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9016 Opcode = X86ISD::VPERMV;
9017 else if (Subtarget.hasAVX()) {
9018 // Scale to v32i8 and perform as v32i8.
9019 IndicesVec = ScaleIndices(IndicesVec, 2);
9020 return DAG.getBitcast(
9021 VT, createVariablePermute(
9022 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9023 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9024 }
9025 break;
9026 case MVT::v8f32:
9027 case MVT::v8i32:
9028 if (Subtarget.hasAVX2())
9029 Opcode = X86ISD::VPERMV;
9030 else if (Subtarget.hasAVX()) {
9031 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9032 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9033 {0, 1, 2, 3, 0, 1, 2, 3});
9034 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9035 {4, 5, 6, 7, 4, 5, 6, 7});
9036 if (Subtarget.hasXOP())
9037 return DAG.getBitcast(
9038 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9039 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9040 // Permute Lo and Hi and then select based on index range.
9041 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9042 SDValue Res = DAG.getSelectCC(
9043 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9044 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9045 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9046 ISD::CondCode::SETGT);
9047 return DAG.getBitcast(VT, Res);
9048 }
9049 break;
9050 case MVT::v4i64:
9051 case MVT::v4f64:
9052 if (Subtarget.hasAVX512()) {
9053 if (!Subtarget.hasVLX()) {
9054 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9055 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9056 SDLoc(SrcVec));
9057 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9058 DAG, SDLoc(IndicesVec));
9059 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9060 DAG, Subtarget);
9061 return extract256BitVector(Res, 0, DAG, DL);
9062 }
9063 Opcode = X86ISD::VPERMV;
9064 } else if (Subtarget.hasAVX()) {
9065 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9066 SDValue LoLo =
9067 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9068 SDValue HiHi =
9069 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9070 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9071 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9072 if (Subtarget.hasXOP())
9073 return DAG.getBitcast(
9074 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9075 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9076 // Permute Lo and Hi and then select based on index range.
9077 // This works as VPERMILPD only uses index bit[1] to permute elements.
9078 SDValue Res = DAG.getSelectCC(
9079 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9081 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9082 ISD::CondCode::SETGT);
9083 return DAG.getBitcast(VT, Res);
9084 }
9085 break;
9086 case MVT::v64i8:
9087 if (Subtarget.hasVBMI())
9088 Opcode = X86ISD::VPERMV;
9089 break;
9090 case MVT::v32i16:
9091 if (Subtarget.hasBWI())
9092 Opcode = X86ISD::VPERMV;
9093 break;
9094 case MVT::v16f32:
9095 case MVT::v16i32:
9096 case MVT::v8f64:
9097 case MVT::v8i64:
9098 if (Subtarget.hasAVX512())
9099 Opcode = X86ISD::VPERMV;
9100 break;
9101 }
9102 if (!Opcode)
9103 return SDValue();
9104
9105 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9106 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9107 "Illegal variable permute shuffle type");
9108
9109 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9110 if (Scale > 1)
9111 IndicesVec = ScaleIndices(IndicesVec, Scale);
9112
9113 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9114 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9115
9116 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9117 SDValue Res = Opcode == X86ISD::VPERMV
9118 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9119 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9120 return DAG.getBitcast(VT, Res);
9121}
9122
9123// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9124// reasoned to be a permutation of a vector by indices in a non-constant vector.
9125// (build_vector (extract_elt V, (extract_elt I, 0)),
9126// (extract_elt V, (extract_elt I, 1)),
9127// ...
9128// ->
9129// (vpermv I, V)
9130//
9131// TODO: Handle undefs
9132// TODO: Utilize pshufb and zero mask blending to support more efficient
9133// construction of vectors with constant-0 elements.
9134static SDValue
9135LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9136 SelectionDAG &DAG,
9137 const X86Subtarget &Subtarget) {
9138 SDValue SrcVec, IndicesVec;
9139
9140 auto PeekThroughFreeze = [](SDValue N) {
9141 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9142 return N->getOperand(0);
9143 return N;
9144 };
9145 // Check for a match of the permute source vector and permute index elements.
9146 // This is done by checking that the i-th build_vector operand is of the form:
9147 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9148 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9149 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9150 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9151 return SDValue();
9152
9153 // If this is the first extract encountered in V, set the source vector,
9154 // otherwise verify the extract is from the previously defined source
9155 // vector.
9156 if (!SrcVec)
9157 SrcVec = Op.getOperand(0);
9158 else if (SrcVec != Op.getOperand(0))
9159 return SDValue();
9160 SDValue ExtractedIndex = Op->getOperand(1);
9161 // Peek through extends.
9162 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9163 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9164 ExtractedIndex = ExtractedIndex.getOperand(0);
9165 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9166 return SDValue();
9167
9168 // If this is the first extract from the index vector candidate, set the
9169 // indices vector, otherwise verify the extract is from the previously
9170 // defined indices vector.
9171 if (!IndicesVec)
9172 IndicesVec = ExtractedIndex.getOperand(0);
9173 else if (IndicesVec != ExtractedIndex.getOperand(0))
9174 return SDValue();
9175
9176 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9177 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9178 return SDValue();
9179 }
9180
9181 MVT VT = V.getSimpleValueType();
9182 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9183}
9184
9185SDValue
9186X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9187 SDLoc dl(Op);
9188
9189 MVT VT = Op.getSimpleValueType();
9190 MVT EltVT = VT.getVectorElementType();
9191 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9192 unsigned NumElems = Op.getNumOperands();
9193
9194 // Generate vectors for predicate vectors.
9195 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9196 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9197
9198 if (VT.getVectorElementType() == MVT::bf16 &&
9199 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9200 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9201
9202 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9203 return VectorCst;
9204
9205 unsigned EVTBits = EltVT.getSizeInBits();
9206 APInt UndefMask = APInt::getZero(NumElems);
9207 APInt FrozenUndefMask = APInt::getZero(NumElems);
9208 APInt ZeroMask = APInt::getZero(NumElems);
9209 APInt NonZeroMask = APInt::getZero(NumElems);
9210 bool IsAllConstants = true;
9211 bool OneUseFrozenUndefs = true;
9212 SmallSet<SDValue, 8> Values;
9213 unsigned NumConstants = NumElems;
9214 for (unsigned i = 0; i < NumElems; ++i) {
9215 SDValue Elt = Op.getOperand(i);
9216 if (Elt.isUndef()) {
9217 UndefMask.setBit(i);
9218 continue;
9219 }
9220 if (ISD::isFreezeUndef(Elt.getNode())) {
9221 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9222 FrozenUndefMask.setBit(i);
9223 continue;
9224 }
9225 Values.insert(Elt);
9226 if (!isIntOrFPConstant(Elt)) {
9227 IsAllConstants = false;
9228 NumConstants--;
9229 }
9230 if (X86::isZeroNode(Elt)) {
9231 ZeroMask.setBit(i);
9232 } else {
9233 NonZeroMask.setBit(i);
9234 }
9235 }
9236
9237 // All undef vector. Return an UNDEF.
9238 if (UndefMask.isAllOnes())
9239 return DAG.getUNDEF(VT);
9240
9241 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9242 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9243 return DAG.getFreeze(DAG.getUNDEF(VT));
9244
9245 // All undef/freeze(undef)/zero vector. Return a zero vector.
9246 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9247 return getZeroVector(VT, Subtarget, DAG, dl);
9248
9249 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9250 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9251 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9252 // and blend the FREEZE-UNDEF operands back in.
9253 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
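 // For example, <x, freeze(undef), y, freeze(undef)> becomes a shuffle of
 // build_vector <x, undef, y, undef> and a freeze(undef) splat with blend
 // mask <0, 5, 2, 7>.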
9254 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9255 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9256 SmallVector<int, 16> BlendMask(NumElems, -1);
9257 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9258 for (unsigned i = 0; i < NumElems; ++i) {
9259 if (UndefMask[i]) {
9260 BlendMask[i] = -1;
9261 continue;
9262 }
9263 BlendMask[i] = i;
9264 if (!FrozenUndefMask[i])
9265 Elts[i] = Op.getOperand(i);
9266 else
9267 BlendMask[i] += NumElems;
9268 }
9269 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9270 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9271 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9272 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9273 }
9274
9275 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9276
9277 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9278 // be better off lowering to a smaller build vector and padding with
9279 // undef/zero.
9280 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9281 !isFoldableUseOfShuffle(BV)) {
9282 unsigned UpperElems = NumElems / 2;
9283 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9284 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9285 if (NumUpperUndefsOrZeros >= UpperElems) {
9286 if (VT.is512BitVector() &&
9287 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9288 UpperElems = NumElems - (NumElems / 4);
9289 // If freeze(undef) is in any upper elements, force to zero.
9290 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9291 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9292 SDValue NewBV =
9293 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9294 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9295 }
9296 }
9297
9298 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9299 return AddSub;
9300 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9301 return HorizontalOp;
9302 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9303 return Broadcast;
9304 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9305 return BitOp;
9306 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9307 return Blend;
9308
9309 unsigned NumZero = ZeroMask.popcount();
9310 unsigned NumNonZero = NonZeroMask.popcount();
9311
9312 // If we are inserting one variable into a vector of non-zero constants, try
9313 // to avoid loading each constant element as a scalar. Load the constants as a
9314 // vector and then insert the variable scalar element. If insertion is not
9315 // supported, fall back to a shuffle to get the scalar blended with the
9316 // constants. Insertion into a zero vector is handled as a special-case
9317 // somewhere below here.
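 // For example, <i32 1, i32 2, i32 X, i32 4> becomes a constant-pool load of
 // <1, 2, undef, 4> followed by an insert of X at index 2, rather than a
 // chain of scalar insertions.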
9318 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9319 FrozenUndefMask.isZero() &&
9320 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9321 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9322 // Create an all-constant vector. The variable element in the old
9323 // build vector is replaced by undef in the constant vector. Save the
9324 // variable scalar element and its index for use in the insertelement.
9325 LLVMContext &Context = *DAG.getContext();
9326 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9327 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9328 SDValue VarElt;
9329 SDValue InsIndex;
9330 for (unsigned i = 0; i != NumElems; ++i) {
9331 SDValue Elt = Op.getOperand(i);
9332 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9333 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9334 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9335 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9336 else if (!Elt.isUndef()) {
9337 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9338 "Expected one variable element in this vector");
9339 VarElt = Elt;
9340 InsIndex = DAG.getVectorIdxConstant(i, dl);
9341 }
9342 }
9343 Constant *CV = ConstantVector::get(ConstVecOps);
9344 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9345
9346 // The constants we just created may not be legal (eg, floating point). We
9347 // must lower the vector right here because we can not guarantee that we'll
9348 // legalize it before loading it. This is also why we could not just create
9349 // a new build vector here. If the build vector contains illegal constants,
9350 // it could get split back up into a series of insert elements.
9351 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9352 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9353 MachineFunction &MF = DAG.getMachineFunction();
9354 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9355 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9356 unsigned InsertC = InsIndex->getAsZExtVal();
9357 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9358 if (InsertC < NumEltsInLow128Bits)
9359 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9360
9361 // There's no good way to insert into the high elements of a >128-bit
9362 // vector, so use shuffles to avoid an extract/insert sequence.
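 // For example, inserting into element 5 of a v8i32 uses the shuffle mask
 // <0,1,2,3,4,8,6,7>, where index 8 picks lane 0 of the scalar_to_vector
 // operand holding the variable element.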
9363 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9364 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9365 SmallVector<int, 8> ShuffleMask;
9366 unsigned NumElts = VT.getVectorNumElements();
9367 for (unsigned i = 0; i != NumElts; ++i)
9368 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9369 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9370 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9371 }
9372
9373 // Special case for single non-zero, non-undef, element.
9374 if (NumNonZero == 1) {
9375 unsigned Idx = NonZeroMask.countr_zero();
9376 SDValue Item = Op.getOperand(Idx);
9377
9378 // If we have a constant or non-constant insertion into the low element of
9379 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9380 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9381 // depending on what the source datatype is.
9382 if (Idx == 0) {
9383 if (NumZero == 0)
9384 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9385
9386 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9387 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9388 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9389 assert((VT.is128BitVector() || VT.is256BitVector() ||
9390 VT.is512BitVector()) &&
9391 "Expected an SSE value type!");
9392 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9393 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9394 // zero vector.
9395 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9396 }
9397
9398 // We can't directly insert an i8 or i16 into a vector, so zero extend
9399 // it to i32 first.
9400 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9401 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9402 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9403 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9404 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9405 return DAG.getBitcast(VT, Item);
9406 }
9407 }
9408
9409 // Is it a vector logical left shift?
9410 if (NumElems == 2 && Idx == 1 &&
9411 X86::isZeroNode(Op.getOperand(0)) &&
9412 !X86::isZeroNode(Op.getOperand(1))) {
9413 unsigned NumBits = VT.getSizeInBits();
9414 return getVShift(true, VT,
9415 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9416 VT, Op.getOperand(1)),
9417 NumBits/2, DAG, *this, dl);
9418 }
9419
9420 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9421 return SDValue();
9422
9423 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9424 // is a non-constant being inserted into an element other than the low one,
9425 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9426 // movd/movss) to move this into the low element, then shuffle it into
9427 // place.
9428 if (EVTBits == 32) {
9429 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9430 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9431 }
9432 }
9433
9434 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9435 if (Values.size() == 1) {
9436 if (EVTBits == 32) {
9437 // Instead of a shuffle like this:
9438 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9439 // Check if it's possible to issue this instead.
9440 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9441 unsigned Idx = NonZeroMask.countr_zero();
9442 SDValue Item = Op.getOperand(Idx);
9443 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9444 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9445 }
9446 return SDValue();
9447 }
9448
9449 // A vector full of immediates; various special cases are already
9450 // handled, so this is best done with a single constant-pool load.
9451 if (IsAllConstants)
9452 return SDValue();
9453
9454 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9455 return V;
9456
9457 // See if we can use a vector load to get all of the elements.
9458 {
9459 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9460 if (SDValue LD =
9461 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9462 return LD;
9463 }
9464
9465 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9466 // build_vector and broadcast it.
9467 // TODO: We could probably generalize this more.
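 // For example, v8i32 <a,b,a,b,a,b,a,b> becomes build_vector <a,b,undef,undef>,
 // bitcast to v2i64, a VBROADCAST of that 64-bit pair, and a bitcast back to
 // v8i32.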
9468 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9469 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9470 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9471 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9472 // Make sure all the even/odd operands match.
9473 for (unsigned i = 2; i != NumElems; ++i)
9474 if (Ops[i % 2] != Op.getOperand(i))
9475 return false;
9476 return true;
9477 };
9478 if (CanSplat(Op, NumElems, Ops)) {
9479 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9480 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9481 // Create a new build vector and cast to v2i64/v2f64.
9482 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9483 DAG.getBuildVector(NarrowVT, dl, Ops));
9484 // Broadcast from v2i64/v2f64 and cast to final VT.
9485 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9486 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9487 NewBV));
9488 }
9489 }
9490
9491 // For AVX-length vectors, build the individual 128-bit pieces and use
9492 // shuffles to put them in place.
9493 if (VT.getSizeInBits() > 128) {
9494 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9495
9496 // Build both the lower and upper subvector.
9497 SDValue Lower =
9498 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9499 SDValue Upper = DAG.getBuildVector(
9500 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9501
9502 // Recreate the wider vector with the lower and upper part.
9503 return concatSubVectors(Lower, Upper, DAG, dl);
9504 }
9505
9506 // Let legalizer expand 2-wide build_vectors.
9507 if (EVTBits == 64) {
9508 if (NumNonZero == 1) {
9509 // One half is zero or undef.
9510 unsigned Idx = NonZeroMask.countr_zero();
9511 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9512 Op.getOperand(Idx));
9513 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9514 }
9515 return SDValue();
9516 }
9517
9518 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9519 if (EVTBits == 8 && NumElems == 16)
9520 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9521 NumZero, DAG, Subtarget))
9522 return V;
9523
9524 if (EltVT == MVT::i16 && NumElems == 8)
9525 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9526 NumZero, DAG, Subtarget))
9527 return V;
9528
9529 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9530 if (EVTBits == 32 && NumElems == 4)
9531 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9532 return V;
9533
9534 // If element VT is == 32 bits, turn it into a number of shuffles.
9535 if (NumElems == 4 && NumZero > 0) {
9536 SmallVector<SDValue, 8> Ops(NumElems);
9537 for (unsigned i = 0; i < 4; ++i) {
9538 bool isZero = !NonZeroMask[i];
9539 if (isZero)
9540 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9541 else
9542 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9543 }
9544
9545 for (unsigned i = 0; i < 2; ++i) {
9546 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9547 default: llvm_unreachable("Unexpected NonZero count");
9548 case 0:
9549 Ops[i] = Ops[i*2]; // Must be a zero vector.
9550 break;
9551 case 1:
9552 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9553 break;
9554 case 2:
9555 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9556 break;
9557 case 3:
9558 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9559 break;
9560 }
9561 }
9562
9563 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9564 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9565 int MaskVec[] = {
9566 Reverse1 ? 1 : 0,
9567 Reverse1 ? 0 : 1,
9568 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9569 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9570 };
9571 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9572 }
9573
9574 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9575
9576 // Check for a build vector from mostly shuffle plus few inserting.
9577 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9578 return Sh;
9579
9580 // For SSE 4.1, use insertps to put the high elements into the low element.
9581 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9582 SDValue Result;
9583 if (!Op.getOperand(0).isUndef())
9584 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9585 else
9586 Result = DAG.getUNDEF(VT);
9587
9588 for (unsigned i = 1; i < NumElems; ++i) {
9589 if (Op.getOperand(i).isUndef()) continue;
9590 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9591 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9592 }
9593 return Result;
9594 }
9595
9596 // Otherwise, expand into a number of unpckl*, start by extending each of
9597 // our (non-undef) elements to the full vector width with the element in the
9598 // bottom slot of the vector (which generates no code for SSE).
9599 SmallVector<SDValue, 8> Ops(NumElems);
9600 for (unsigned i = 0; i < NumElems; ++i) {
9601 if (!Op.getOperand(i).isUndef())
9602 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9603 else
9604 Ops[i] = DAG.getUNDEF(VT);
9605 }
9606
9607 // Next, we iteratively mix elements, e.g. for v4f32:
9608 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9609 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9610 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9611 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9612 // Generate scaled UNPCKL shuffle mask.
9613 SmallVector<int, 16> Mask;
9614 for(unsigned i = 0; i != Scale; ++i)
9615 Mask.push_back(i);
9616 for (unsigned i = 0; i != Scale; ++i)
9617 Mask.push_back(NumElems+i);
9618 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9619
9620 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9621 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9622 }
9623 return Ops[0];
9624}
9625
9626// 256-bit AVX can use the vinsertf128 instruction
9627// to create 256-bit vectors from two other 128-bit ones.
9628// TODO: Detect subvector broadcast here instead of DAG combine?
9629static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9630 SelectionDAG &DAG,
9631 const X86Subtarget &Subtarget) {
9632 MVT ResVT = Op.getSimpleValueType();
9633 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9634 "Value type must be 256-/512-bit wide");
9635
9636 unsigned NumOperands = Op.getNumOperands();
9637 unsigned NumFreezeUndef = 0;
9638 unsigned NumZero = 0;
9639 unsigned NumNonZero = 0;
9640 unsigned NonZeros = 0;
9641 SmallSet<SDValue, 4> Undefs;
9642 for (unsigned i = 0; i != NumOperands; ++i) {
9643 SDValue SubVec = Op.getOperand(i);
9644 if (SubVec.isUndef())
9645 continue;
9646 if (ISD::isFreezeUndef(SubVec.getNode())) {
9647 // If the freeze(undef) has multiple uses then we must fold to zero.
9648 if (SubVec.hasOneUse()) {
9649 ++NumFreezeUndef;
9650 } else {
9651 ++NumZero;
9652 Undefs.insert(SubVec);
9653 }
9654 }
9655 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9656 ++NumZero;
9657 else {
9658 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9659 NonZeros |= 1 << i;
9660 ++NumNonZero;
9661 }
9662 }
9663
9664 // If we have more than 2 non-zeros, build each half separately.
9665 if (NumNonZero > 2) {
9666 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9667 ArrayRef<SDUse> Ops = Op->ops();
9668 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9669 Ops.slice(0, NumOperands/2));
9670 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9671 Ops.slice(NumOperands/2));
9672 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9673 }
9674
9675 // Otherwise, build it up through insert_subvectors.
9676 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9677 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9678 : DAG.getUNDEF(ResVT));
9679
9680 // Replace Undef operands with ZeroVector.
9681 for (SDValue U : Undefs)
9682 DAG.ReplaceAllUsesWith(
9683 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9684
9685 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9686 unsigned NumSubElems = SubVT.getVectorNumElements();
9687 for (unsigned i = 0; i != NumOperands; ++i) {
9688 if ((NonZeros & (1 << i)) == 0)
9689 continue;
9690
9691 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9692 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9693 }
9694
9695 return Vec;
9696}
9697
9698// Returns true if the given node is a type promotion (by concatenating i1
9699// zeros) of the result of a node that already zeros all upper bits of
9700// k-register.
9701// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9702static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9703 const X86Subtarget &Subtarget,
9704 SelectionDAG & DAG) {
9705 MVT ResVT = Op.getSimpleValueType();
9706 unsigned NumOperands = Op.getNumOperands();
9707 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9708 "Unexpected number of operands in CONCAT_VECTORS");
9709
9710 uint64_t Zeros = 0;
9711 uint64_t NonZeros = 0;
9712 for (unsigned i = 0; i != NumOperands; ++i) {
9713 SDValue SubVec = Op.getOperand(i);
9714 if (SubVec.isUndef())
9715 continue;
9716 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9717 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9718 Zeros |= (uint64_t)1 << i;
9719 else
9720 NonZeros |= (uint64_t)1 << i;
9721 }
9722
9723 unsigned NumElems = ResVT.getVectorNumElements();
9724
9725 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9726 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9727 // insert_subvector will give us two kshifts.
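 // For example, concat(v2i1 zero, v2i1 X, v2i1 undef, v2i1 undef) widens X to
 // the shift type and emits a single KSHIFTL by 2, leaving zeros in the low
 // elements.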
9728 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9729 Log2_64(NonZeros) != NumOperands - 1) {
9730 unsigned Idx = Log2_64(NonZeros);
9731 SDValue SubVec = Op.getOperand(Idx);
9732 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9733 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9734 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9735 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9736 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9737 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9738 DAG.getVectorIdxConstant(0, dl));
9739 }
9740
9741 // If there are zero or one non-zeros we can handle this very simply.
9742 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9743 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9744 if (!NonZeros)
9745 return Vec;
9746 unsigned Idx = Log2_64(NonZeros);
9747 SDValue SubVec = Op.getOperand(Idx);
9748 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9749 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9750 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9751 }
9752
9753 if (NumOperands > 2) {
9754 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9755 ArrayRef<SDUse> Ops = Op->ops();
9756 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9757 Ops.slice(0, NumOperands / 2));
9758 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9759 Ops.slice(NumOperands / 2));
9760 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9761 }
9762
9763 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9764
9765 if (ResVT.getVectorNumElements() >= 16)
9766 return Op; // The operation is legal with KUNPCK
9767
9768 SDValue Vec =
9769 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9770 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9771 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9772 DAG.getVectorIdxConstant(NumElems / 2, dl));
9773}
9774
9775static SDValue LowerCONCAT_VECTORS(SDValue Op,
9776 const X86Subtarget &Subtarget,
9777 SelectionDAG &DAG) {
9778 SDLoc DL(Op);
9779 MVT VT = Op.getSimpleValueType();
9780 if (VT.getVectorElementType() == MVT::i1)
9781 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9782
9783 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9784 // from two other 128-bit ones.
9785 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9786 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9787 (VT.is512BitVector() &&
9788 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9789 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9790}
9791
9792//===----------------------------------------------------------------------===//
9793// Vector shuffle lowering
9794//
9795// This is an experimental code path for lowering vector shuffles on x86. It is
9796// designed to handle arbitrary vector shuffles and blends, gracefully
9797// degrading performance as necessary. It works hard to recognize idiomatic
9798// shuffles and lower them to optimal instruction patterns without leaving
9799// a framework that allows reasonably efficient handling of all vector shuffle
9800// patterns.
9801//===----------------------------------------------------------------------===//
9802
9803/// Checks whether the vector elements referenced by two shuffle masks are
9804/// equivalent.
9805static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9806 int Idx, int ExpectedIdx) {
9807 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9808 ExpectedIdx < MaskSize && "Out of range element index");
9809 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9810 return false;
9811
9812 EVT VT = Op.getValueType();
9813 EVT ExpectedVT = ExpectedOp.getValueType();
9814
9815 // Sources must be vectors and match the mask's element count.
9816 if (!VT.isVector() || !ExpectedVT.isVector() ||
9817 (int)VT.getVectorNumElements() != MaskSize ||
9818 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9819 return false;
9820
9821 // Exact match.
9822 if (Idx == ExpectedIdx && Op == ExpectedOp)
9823 return true;
9824
9825 switch (Op.getOpcode()) {
9826 case ISD::BUILD_VECTOR:
9827 // If the values are build vectors, we can look through them to find
9828 // equivalent inputs that make the shuffles equivalent.
9829 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9830 case ISD::BITCAST: {
9831 SDValue Src = peekThroughBitcasts(Op);
9832 EVT SrcVT = Src.getValueType();
9833 if (Op == ExpectedOp && SrcVT.isVector()) {
9834 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9835 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9836 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9837 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9838 Idx / Scale, ExpectedIdx / Scale);
9839 }
9840 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9841 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9842 for (unsigned I = 0; I != Scale; ++I)
9843 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9844 (Idx * Scale) + I,
9845 (ExpectedIdx * Scale) + I))
9846 return false;
9847 return true;
9848 }
9849 }
9850 break;
9851 }
9852 case ISD::VECTOR_SHUFFLE: {
9853 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9854 return Op == ExpectedOp &&
9855 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9856 }
9857 case X86ISD::VBROADCAST:
9858 case X86ISD::VBROADCAST_LOAD:
9859 return Op == ExpectedOp;
9860 case X86ISD::SUBV_BROADCAST_LOAD:
9861 if (Op == ExpectedOp) {
9862 auto *MemOp = cast<MemSDNode>(Op);
9863 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9864 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9865 }
9866 break;
9867 case X86ISD::VPERMI: {
9868 if (Op == ExpectedOp) {
9869 SmallVector<int, 8> Mask;
9870 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9871 SDValue Src = Op.getOperand(0);
9872 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9873 Mask[ExpectedIdx]);
9874 }
9875 break;
9876 }
9877 case X86ISD::HADD:
9878 case X86ISD::HSUB:
9879 case X86ISD::FHADD:
9880 case X86ISD::FHSUB:
9881 case X86ISD::PACKSS:
9882 case X86ISD::PACKUS:
9883 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9884 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
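 // For example, v8i16 HADD(X,X) produces { X0+X1, X2+X3, X4+X5, X6+X7,
 // X0+X1, X2+X3, X4+X5, X6+X7 }, so elements 1 and 5 are equivalent: they are
 // in the same lane and at the same offset within a half-lane.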
9885 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9886 int NumElts = VT.getVectorNumElements();
9887 int NumLanes = VT.getSizeInBits() / 128;
9888 int NumEltsPerLane = NumElts / NumLanes;
9889 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9890 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9891 bool SameElt =
9892 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9893 return SameLane && SameElt;
9894 }
9895 break;
9896 }
9897
9898 return false;
9899}
9900
9901/// Tiny helper function to identify a no-op mask.
9902///
9903/// This is a somewhat boring predicate function. It checks whether the mask
9904/// array input, which is assumed to be a single-input shuffle mask of the kind
9905/// used by the X86 shuffle instructions (not a fully general
9906/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9907/// in-place shuffle are 'no-op's.
9908static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9909 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9910 assert(Mask[i] >= -1 && "Out of bound mask element!");
9911 if (Mask[i] >= 0 && Mask[i] != i)
9912 return false;
9913 }
9914 return true;
9915}
9916
9917/// Test whether there are elements crossing LaneSizeInBits lanes in this
9918/// shuffle mask.
9919///
9920/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9921/// and we routinely test for these.
9922static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9923 unsigned ScalarSizeInBits,
9924 ArrayRef<int> Mask) {
9925 assert(LaneSizeInBits && ScalarSizeInBits &&
9926 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9927 "Illegal shuffle lane size");
9928 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9929 int Size = Mask.size();
9930 for (int i = 0; i < Size; ++i)
9931 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9932 return true;
9933 return false;
9934}
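// Illustrative example: for v8f32 with 128-bit lanes (LaneSize == 4), the
// mask <0,1,2,3,0,1,2,3> crosses lanes (elements 4-7 read from lane 0),
// whereas <2,3,0,1,6,7,4,5> keeps every element within its own lane and so
// is not lane-crossing.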
9935
9936/// Test whether there are elements crossing 128-bit lanes in this
9937/// shuffle mask.
9938static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9939 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9940}
9941
9942/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9943/// from multiple lanes - unlike isLaneCrossingShuffleMask, this better
9944/// supports 'repeated mask + lane permute' style shuffles.
9945static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9946 unsigned ScalarSizeInBits,
9947 ArrayRef<int> Mask) {
9948 assert(LaneSizeInBits && ScalarSizeInBits &&
9949 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9950 "Illegal shuffle lane size");
9951 int NumElts = Mask.size();
9952 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9953 int NumLanes = NumElts / NumEltsPerLane;
9954 if (NumLanes > 1) {
9955 for (int i = 0; i != NumLanes; ++i) {
9956 int SrcLane = -1;
9957 for (int j = 0; j != NumEltsPerLane; ++j) {
9958 int M = Mask[(i * NumEltsPerLane) + j];
9959 if (M < 0)
9960 continue;
9961 int Lane = (M % NumElts) / NumEltsPerLane;
9962 if (SrcLane >= 0 && SrcLane != Lane)
9963 return true;
9964 SrcLane = Lane;
9965 }
9966 }
9967 }
9968 return false;
9969}
9970
9971/// Test whether a shuffle mask is equivalent within each sub-lane.
9972///
9973/// This checks a shuffle mask to see if it is performing the same
9974/// lane-relative shuffle in each sub-lane. This trivially implies
9975/// that it is also not lane-crossing. It may however involve a blend from the
9976/// same lane of a second vector.
9977///
9978/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9979/// non-trivial to compute in the face of undef lanes. The representation is
9980/// suitable for use with existing 128-bit shuffles as entries from the second
9981/// vector have been remapped to [LaneSize, 2*LaneSize).
9982static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9983 ArrayRef<int> Mask,
9984 SmallVectorImpl<int> &RepeatedMask) {
9985 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9986 RepeatedMask.assign(LaneSize, -1);
9987 int Size = Mask.size();
9988 for (int i = 0; i < Size; ++i) {
9989 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9990 if (Mask[i] < 0)
9991 continue;
9992 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9993 // This entry crosses lanes, so there is no way to model this shuffle.
9994 return false;
9995
9996 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9997 // Adjust second vector indices to start at LaneSize instead of Size.
9998 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9999 : Mask[i] % LaneSize + LaneSize;
10000 if (RepeatedMask[i % LaneSize] < 0)
10001 // This is the first non-undef entry in this slot of a 128-bit lane.
10002 RepeatedMask[i % LaneSize] = LocalM;
10003 else if (RepeatedMask[i % LaneSize] != LocalM)
10004 // Found a mismatch with the repeated mask.
10005 return false;
10006 }
10007 return true;
10008}
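// Illustrative example: for v8f32 with 128-bit lanes, the binary mask
// <0,9,2,11,4,13,6,15> repeats per lane and produces
// RepeatedMask = <0,5,2,7>, with second-vector entries remapped into the
// [LaneSize, 2*LaneSize) range.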
10009
10010/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10011static bool
10012is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10013 SmallVectorImpl<int> &RepeatedMask) {
10014 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10015}
10016
10017static bool
10018is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10019 SmallVector<int, 32> RepeatedMask;
10020 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10021}
10022
10023/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10024static bool
10025is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10026 SmallVectorImpl<int> &RepeatedMask) {
10027 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10028}
10029
10030/// Test whether a target shuffle mask is equivalent within each sub-lane.
10031/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10032static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10033 unsigned EltSizeInBits,
10034 ArrayRef<int> Mask,
10035 SmallVectorImpl<int> &RepeatedMask) {
10036 int LaneSize = LaneSizeInBits / EltSizeInBits;
10037 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10038 int Size = Mask.size();
10039 for (int i = 0; i < Size; ++i) {
10040 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10041 if (Mask[i] == SM_SentinelUndef)
10042 continue;
10043 if (Mask[i] == SM_SentinelZero) {
10044 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10045 return false;
10046 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10047 continue;
10048 }
10049 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10050 // This entry crosses lanes, so there is no way to model this shuffle.
10051 return false;
10052
10053 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10054 // later vector indices to start at multiples of LaneSize instead of Size.
10055 int LaneM = Mask[i] / Size;
10056 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10057 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10058 // This is the first non-undef entry in this slot of a 128-bit lane.
10059 RepeatedMask[i % LaneSize] = LocalM;
10060 else if (RepeatedMask[i % LaneSize] != LocalM)
10061 // Found a mismatch with the repeated mask.
10062 return false;
10063 }
10064 return true;
10065}
10066
10067/// Test whether a target shuffle mask is equivalent within each sub-lane.
10068/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10069static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10070 ArrayRef<int> Mask,
10071 SmallVectorImpl<int> &RepeatedMask) {
10072 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10073 Mask, RepeatedMask);
10074}
10075
10076/// Checks whether a shuffle mask is equivalent to an explicit list of
10077/// arguments.
10078///
10079/// This is a fast way to test a shuffle mask against a fixed pattern:
10080///
10081/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10082///
10083/// It returns true if the mask is exactly as wide as ExpectedMask, and each
10084/// element of the mask is either -1 (signifying undef) or the value given in
10085/// the corresponding position of ExpectedMask.
10086static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10087 SDValue V1 = SDValue(),
10088 SDValue V2 = SDValue()) {
10089 int Size = Mask.size();
10090 if (Size != (int)ExpectedMask.size())
10091 return false;
10092
10093 for (int i = 0; i < Size; ++i) {
10094 assert(Mask[i] >= -1 && "Out of bound mask element!");
10095 int MaskIdx = Mask[i];
10096 int ExpectedIdx = ExpectedMask[i];
10097 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10098 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10099 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10100 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10101 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10102 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10103 return false;
10104 }
10105 }
10106 return true;
10107}
10108
10109/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10110///
10111/// The masks must be exactly the same width.
10112///
10113/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10114/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10115///
10116/// SM_SentinelZero is accepted as a valid negative index but must match in
10117/// both, or via a known bits test.
10118static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10119 ArrayRef<int> ExpectedMask,
10120 const SelectionDAG &DAG,
10121 SDValue V1 = SDValue(),
10122 SDValue V2 = SDValue()) {
10123 int Size = Mask.size();
10124 if (Size != (int)ExpectedMask.size())
10125 return false;
10126 assert(llvm::all_of(ExpectedMask,
10127 [Size](int M) {
10128 return M == SM_SentinelZero ||
10129 isInRange(M, 0, 2 * Size);
10130 }) &&
10131 "Illegal target shuffle mask");
10132
10133 // Check for out-of-range target shuffle mask indices.
10134 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10135 return false;
10136
10137 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10138 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10139 !V1.getValueType().isVector()))
10140 V1 = SDValue();
10141 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10142 !V2.getValueType().isVector()))
10143 V2 = SDValue();
10144
10145 APInt ZeroV1 = APInt::getZero(Size);
10146 APInt ZeroV2 = APInt::getZero(Size);
10147
10148 for (int i = 0; i < Size; ++i) {
10149 int MaskIdx = Mask[i];
10150 int ExpectedIdx = ExpectedMask[i];
10151 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10152 continue;
10153 // If we failed to match an expected SM_SentinelZero then early out.
10154 if (ExpectedIdx < 0)
10155 return false;
10156 if (MaskIdx == SM_SentinelZero) {
10157 // If we need this expected index to be a zero element, then update the
10158 // relevant zero mask and perform the known-bits check at the end to
10159 // minimize repeated computations.
10160 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10161 if (ExpectedV &&
10162 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10163 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10164 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10165 ZeroMask.setBit(BitIdx);
10166 continue;
10167 }
10168 }
10169 if (MaskIdx >= 0) {
10170 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10171 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10172 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10173 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10174 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10175 continue;
10176 }
10177 return false;
10178 }
10179 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10180 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10181}
10182
10183// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10184// instructions.
10185static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10186 const SelectionDAG &DAG) {
10187 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10188 return false;
10189
10190 SmallVector<int, 8> Unpcklwd;
10191 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10192 /* Unary = */ false);
10193 SmallVector<int, 8> Unpckhwd;
10194 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10195 /* Unary = */ false);
10196 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10197 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10198 return IsUnpackwdMask;
10199}
10200
10201static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10202 const SelectionDAG &DAG) {
10203 // Create 128-bit vector type based on mask size.
10204 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10205 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10206
10207 // We can't assume a canonical shuffle mask, so try the commuted version too.
10208 SmallVector<int, 4> CommutedMask(Mask);
10209 ShuffleVectorSDNode::commuteMask(CommutedMask);
10210
10211 // Match any of unary/binary or low/high.
10212 for (unsigned i = 0; i != 4; ++i) {
10213 SmallVector<int, 16> UnpackMask;
10214 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10215 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10216 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10217 return true;
10218 }
10219 return false;
10220}
10221
10222/// Return true if a shuffle mask chooses elements identically in its top and
10223/// bottom halves. For example, any splat mask has the same top and bottom
10224/// halves. If an element is undefined in only one half of the mask, the halves
10225/// are not considered identical.
10226static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10227 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10228 unsigned HalfSize = Mask.size() / 2;
10229 for (unsigned i = 0; i != HalfSize; ++i) {
10230 if (Mask[i] != Mask[i + HalfSize])
10231 return false;
10232 }
10233 return true;
10234}
10235
10236/// Get a 4-lane 8-bit shuffle immediate for a mask.
10237///
10238/// This helper function produces an 8-bit shuffle immediate corresponding to
10239/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10240/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10241/// example.
10242///
10243/// NB: We rely heavily on "undef" masks preserving the input lane.
10244static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10245 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10246 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10247 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10248 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10249 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10250
10251 // If the mask only uses one non-undef element, then fully 'splat' it to
10252 // improve later broadcast matching.
10253 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10254 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10255
10256 int FirstElt = Mask[FirstIndex];
10257 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10258 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10259
10260 unsigned Imm = 0;
10261 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10262 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10263 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10264 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10265 return Imm;
10266}
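// Illustrative example: the mask <3,1,2,0> encodes as 0b00100111 (0x27),
// where bit pair i selects the source element for result element i. A mask
// using a single non-undef element, e.g. <-1,2,-1,-1>, is splatted to 0xAA
// (i.e. <2,2,2,2>) to help later broadcast matching.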
10267
10268static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10269 SelectionDAG &DAG) {
10270 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10271}
10272
10273// Canonicalize SHUFPD mask to improve chances of further folding.
10274// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10275static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10276 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10277 "Unexpected SHUFPD mask size");
10278 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10279 "Unexpected SHUFPD mask elements");
10280
10281 // If the mask only uses one non-undef element, then fully 'splat' it to
10282 // improve later broadcast matching.
10283 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10284 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10285 "All undef shuffle mask");
10286
10287 int FirstElt = Mask[FirstIndex];
10288 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10289 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10290 unsigned Imm = 0;
10291 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10292 Imm |= FirstElt << I;
10293 return Imm;
10294 }
10295
10296 // Attempt to keep any undef elements in place to improve chances of the
10297 // shuffle becoming a (commutative) blend.
10298 unsigned Imm = 0;
10299 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10300 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10301
10302 return Imm;
10303}
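// Illustrative example: for a v4f64 mask <-1,1,0,-1>, the undef elements
// keep their default in-place lo/hi bits and the immediate becomes 0b1010,
// while the splat-like mask <1,-1,1,-1> is canonicalized to 0b1111 to
// improve broadcast matching.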
10304
10305static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10306 SelectionDAG &DAG) {
10307 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10308}
10309
10310// The shuffle result takes the form:
10311// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
10312// ascending order. Each element of Zeroable corresponds to a particular
10313// element of Mask, as described by the computeZeroableShuffleElements function.
10314//
10315// The function looks for a sub-mask whose non-zero elements are in
10316// increasing order; if such a sub-mask exists, the function returns true.
10317static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10318 ArrayRef<int> Mask, const EVT &VectorType,
10319 bool &IsZeroSideLeft) {
10320 int NextElement = -1;
10321 // Check if the Mask's nonzero elements are in increasing order.
10322 for (int i = 0, e = Mask.size(); i < e; i++) {
10323 // Check that the mask's zero elements are built from only zeros.
10324 assert(Mask[i] >= -1 && "Out of bound mask element!");
10325 if (Mask[i] < 0)
10326 return false;
10327 if (Zeroable[i])
10328 continue;
10329 // Find the lowest non-zero element.
10330 if (NextElement < 0) {
10331 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10332 IsZeroSideLeft = NextElement != 0;
10333 }
10334 // Exit if the mask's non-zero elements are not in increasing order.
10335 if (NextElement != Mask[i])
10336 return false;
10337 NextElement++;
10338 }
10339 return true;
10340}
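// Illustrative example: with Mask = <0,4,1,4> and elements 1 and 3 known to
// be zero (e.g. V2 is all zeros), the non-zero elements <0,1> appear in
// increasing order, so the shuffle can lower to a VEXPAND of V1 with the
// expansion mask ~Zeroable = 0b0101 (see lowerShuffleToEXPAND below).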
10341
10342static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10344 const X86Subtarget &Subtarget,
10345 unsigned Depth = 0);
10346
10347/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10349 ArrayRef<int> Mask, SDValue V1,
10350 SDValue V2, const APInt &Zeroable,
10351 const X86Subtarget &Subtarget,
10352 SelectionDAG &DAG) {
10353 int Size = Mask.size();
10354 int LaneSize = 128 / VT.getScalarSizeInBits();
10355 const int NumBytes = VT.getSizeInBits() / 8;
10356 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10357
10358 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10359 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10360 (Subtarget.hasBWI() && VT.is512BitVector()));
10361
10362 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10363 // Sign bit set in i8 mask means zero element.
10364 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10365
10366 SDValue V;
10367 for (int i = 0; i < NumBytes; ++i) {
10368 int M = Mask[i / NumEltBytes];
10369 if (M < 0) {
10370 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10371 continue;
10372 }
10373 if (Zeroable[i / NumEltBytes]) {
10374 PSHUFBMask[i] = ZeroMask;
10375 continue;
10376 }
10377
10378 // We can only use a single input of V1 or V2.
10379 SDValue SrcV = (M >= Size ? V2 : V1);
10380 if (V && V != SrcV)
10381 return SDValue();
10382 V = SrcV;
10383 M %= Size;
10384
10385 // PSHUFB can't cross lanes, ensure this doesn't happen.
10386 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10387 return SDValue();
10388
10389 M = M % LaneSize;
10390 M = M * NumEltBytes + (i % NumEltBytes);
10391 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10392 }
10393 assert(V && "Failed to find a source input");
10394
10395 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10396 return DAG.getBitcast(
10397 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10398 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10399}
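// Illustrative example: the single-input v4i32 shuffle <1,1,2,3> becomes a
// PSHUFB with byte mask <4,5,6,7, 4,5,6,7, 8,9,10,11, 12,13,14,15>;
// zeroable elements would instead receive the 0x80 'zero' byte.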
10400
10401static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10402 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10403 const SDLoc &dl);
10404
10405// X86 has a dedicated shuffle that can be lowered to VEXPAND.
10406static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10407 SDValue V2, ArrayRef<int> Mask,
10408 const APInt &Zeroable,
10409 const X86Subtarget &Subtarget,
10410 SelectionDAG &DAG) {
10411 bool IsLeftZeroSide = true;
10412 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10413 IsLeftZeroSide))
10414 return SDValue();
10415 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10416 MVT IntegerType =
10417 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10418 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10419 unsigned NumElts = VT.getVectorNumElements();
10420 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10421 "Unexpected number of vector elements");
10422 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10423 Subtarget, DAG, DL);
10424 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10425 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10426 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10427}
10428
10429static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10430 unsigned &UnpackOpcode, bool IsUnary,
10431 ArrayRef<int> TargetMask, const SDLoc &DL,
10432 SelectionDAG &DAG,
10433 const X86Subtarget &Subtarget) {
10434 int NumElts = VT.getVectorNumElements();
10435
10436 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10437 for (int i = 0; i != NumElts; i += 2) {
10438 int M1 = TargetMask[i + 0];
10439 int M2 = TargetMask[i + 1];
10440 Undef1 &= (SM_SentinelUndef == M1);
10441 Undef2 &= (SM_SentinelUndef == M2);
10442 Zero1 &= isUndefOrZero(M1);
10443 Zero2 &= isUndefOrZero(M2);
10444 }
10445 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10446 "Zeroable shuffle detected");
10447
10448 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10449 SmallVector<int, 64> Unpckl, Unpckh;
10450 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10451 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10452 (IsUnary ? V1 : V2))) {
10453 UnpackOpcode = X86ISD::UNPCKL;
10454 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10455 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10456 return true;
10457 }
10458
10459 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10460 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10461 (IsUnary ? V1 : V2))) {
10462 UnpackOpcode = X86ISD::UNPCKH;
10463 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10464 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10465 return true;
10466 }
10467
10468 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10469 if (IsUnary && (Zero1 || Zero2)) {
10470 // Don't bother if we can blend instead.
10471 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10472 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10473 return false;
10474
10475 bool MatchLo = true, MatchHi = true;
10476 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10477 int M = TargetMask[i];
10478
10479 // Ignore if the input is known to be zero or the index is undef.
10480 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10481 (M == SM_SentinelUndef))
10482 continue;
10483
10484 MatchLo &= (M == Unpckl[i]);
10485 MatchHi &= (M == Unpckh[i]);
10486 }
10487
10488 if (MatchLo || MatchHi) {
10489 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10490 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10491 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10492 return true;
10493 }
10494 }
10495
10496 // If a binary shuffle, commute and try again.
10497 if (!IsUnary) {
10498 ShuffleVectorSDNode::commuteMask(Unpckl);
10499 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10500 UnpackOpcode = X86ISD::UNPCKL;
10501 std::swap(V1, V2);
10502 return true;
10503 }
10504
10505 ShuffleVectorSDNode::commuteMask(Unpckh);
10506 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10507 UnpackOpcode = X86ISD::UNPCKH;
10508 std::swap(V1, V2);
10509 return true;
10510 }
10511 }
10512
10513 return false;
10514}
10515
10516// X86 has dedicated unpack instructions that can handle specific blend
10517// operations: UNPCKH and UNPCKL.
10518static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10519 SDValue V2, ArrayRef<int> Mask,
10520 SelectionDAG &DAG) {
10521 SmallVector<int, 8> Unpckl;
10522 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10523 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10524 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10525
10526 SmallVector<int, 8> Unpckh;
10527 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10528 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10529 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10530
10531 // Commute and try again.
10532 ShuffleVectorSDNode::commuteMask(Unpckl);
10533 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10534 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10535
10536 ShuffleVectorSDNode::commuteMask(Unpckh);
10537 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10538 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10539
10540 return SDValue();
10541}
10542
10543/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10544/// followed by unpack 256-bit.
10545static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10546 SDValue V2, ArrayRef<int> Mask,
10547 SelectionDAG &DAG) {
10548 SmallVector<int, 32> Unpckl, Unpckh;
10549 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10550 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10551
10552 unsigned UnpackOpcode;
10553 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10554 UnpackOpcode = X86ISD::UNPCKL;
10555 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10556 UnpackOpcode = X86ISD::UNPCKH;
10557 else
10558 return SDValue();
10559
10560 // This is a "natural" unpack operation (rather than the 128-bit sectored
10561 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10562 // input in order to use the x86 instruction.
10563 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10564 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10565 V1 = DAG.getBitcast(VT, V1);
10566 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10567}
10568
10569// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10570// source into the lower elements and zeroing the upper elements.
10571static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10572 ArrayRef<int> Mask, const APInt &Zeroable,
10573 const X86Subtarget &Subtarget) {
10574 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10575 return false;
10576
10577 unsigned NumElts = Mask.size();
10578 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10579 unsigned MaxScale = 64 / EltSizeInBits;
10580
10581 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10582 unsigned SrcEltBits = EltSizeInBits * Scale;
10583 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10584 continue;
10585 unsigned NumSrcElts = NumElts / Scale;
10586 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10587 continue;
10588 unsigned UpperElts = NumElts - NumSrcElts;
10589 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10590 continue;
10591 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10592 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10593 DstVT = MVT::getIntegerVT(EltSizeInBits);
10594 if ((NumSrcElts * EltSizeInBits) >= 128) {
10595 // ISD::TRUNCATE
10596 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10597 } else {
10598 // X86ISD::VTRUNC
10599 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10600 }
10601 return true;
10602 }
10603
10604 return false;
10605}
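// Illustrative example: a v16i8 mask <0,2,4,6,8,10,12,14,Z,Z,Z,Z,Z,Z,Z,Z>
// (upper half zeroable) matches with Scale == 2, giving SrcVT = v8i16; the
// truncated data is only 64 bits wide, so DstVT = v16i8 and the
// X86ISD::VTRUNC path is used (subject to the BWI restriction above).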
10606
10607// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10608// element padding to the final DstVT.
10609static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10610 const X86Subtarget &Subtarget,
10611 SelectionDAG &DAG, bool ZeroUppers) {
10612 MVT SrcVT = Src.getSimpleValueType();
10613 MVT DstSVT = DstVT.getScalarType();
10614 unsigned NumDstElts = DstVT.getVectorNumElements();
10615 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10616 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10617
10618 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10619 return SDValue();
10620
10621 // Perform a direct ISD::TRUNCATE if possible.
10622 if (NumSrcElts == NumDstElts)
10623 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10624
10625 if (NumSrcElts > NumDstElts) {
10626 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10627 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10628 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10629 }
10630
10631 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10632 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10633 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10634 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10635 DstVT.getSizeInBits());
10636 }
10637
10638 // Non-VLX targets must truncate from a 512-bit type, so we need to
10639 // widen, truncate and then possibly extract the original subvector.
10640 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10641 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10642 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10643 }
10644
10645 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10647 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10648 if (DstVT != TruncVT)
10649 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10650 DstVT.getSizeInBits());
10651 return Trunc;
10652}
10653
10654// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10655//
10656// An example is the following:
10657//
10658// t0: ch = EntryToken
10659// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10660// t25: v4i32 = truncate t2
10661// t41: v8i16 = bitcast t25
10662// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10663// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10664// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10665// t18: v2i64 = bitcast t51
10666//
10667// One can just use a single vpmovdw instruction; without avx512vl we need to
10668// use the zmm variant and extract the lower subvector, padding with zeroes.
10669// TODO: Merge with lowerShuffleAsVTRUNC.
10670static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10671 SDValue V2, ArrayRef<int> Mask,
10672 const APInt &Zeroable,
10673 const X86Subtarget &Subtarget,
10674 SelectionDAG &DAG) {
10675 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10676 if (!Subtarget.hasAVX512())
10677 return SDValue();
10678
10679 unsigned NumElts = VT.getVectorNumElements();
10680 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10681 unsigned MaxScale = 64 / EltSizeInBits;
10682 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10683 unsigned SrcEltBits = EltSizeInBits * Scale;
10684 unsigned NumSrcElts = NumElts / Scale;
10685 unsigned UpperElts = NumElts - NumSrcElts;
10686 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10687 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10688 continue;
10689
10690 // Attempt to find a matching source truncation, but as a fall back VLX
10691 // cases can use the VPMOV directly.
10692 SDValue Src = peekThroughBitcasts(V1);
10693 if (Src.getOpcode() == ISD::TRUNCATE &&
10694 Src.getScalarValueSizeInBits() == SrcEltBits) {
10695 Src = Src.getOperand(0);
10696 } else if (Subtarget.hasVLX()) {
10697 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10698 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10699 Src = DAG.getBitcast(SrcVT, Src);
10700 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10701 if (Scale == 2 &&
10702 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10703 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10704 return SDValue();
10705 } else
10706 return SDValue();
10707
10708 // VPMOVWB is only available with avx512bw.
10709 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10710 return SDValue();
10711
10712 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10713 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10714 }
10715
10716 return SDValue();
10717}
10718
10719// Attempt to match binary shuffle patterns as a truncate.
10720static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10721 SDValue V2, ArrayRef<int> Mask,
10722 const APInt &Zeroable,
10723 const X86Subtarget &Subtarget,
10724 SelectionDAG &DAG) {
10725 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10726 "Unexpected VTRUNC type");
10727 if (!Subtarget.hasAVX512() ||
10728 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10729 return SDValue();
10730
10731 unsigned NumElts = VT.getVectorNumElements();
10732 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10733 unsigned MaxScale = 64 / EltSizeInBits;
10734 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10735 // TODO: Support non-BWI VPMOVWB truncations?
10736 unsigned SrcEltBits = EltSizeInBits * Scale;
10737 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10738 continue;
10739
10740 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10741 // Bail if the V2 elements are undef.
10742 unsigned NumHalfSrcElts = NumElts / Scale;
10743 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10744 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10745 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10746 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10747 continue;
10748
10749 // The elements beyond the truncation must be undef/zero.
10750 unsigned UpperElts = NumElts - NumSrcElts;
10751 if (UpperElts > 0 &&
10752 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10753 continue;
10754 bool UndefUppers =
10755 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10756
10757 // As we're using both sources, we need to concat them together
10758 // and truncate from the double-sized src.
10759 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10760
10761 // For offset truncations, ensure that the concat is cheap.
10762 SDValue Src =
10763 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10764 if (!Src) {
10765 if (Offset)
10766 continue;
10767 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10768 }
10769
10770 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10771 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10772 Src = DAG.getBitcast(SrcVT, Src);
10773
10774 // Shift the offset'd elements into place for the truncation.
10775 // TODO: Use getTargetVShiftByConstNode.
10776 if (Offset)
10777 Src = DAG.getNode(
10778 X86ISD::VSRLI, DL, SrcVT, Src,
10779 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10780
10781 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10782 }
10783 }
10784
10785 return SDValue();
10786}
10787
10788/// Check whether a compaction lowering can be done by dropping even/odd
10789/// elements and compute how many times even/odd elements must be dropped.
10790///
10791/// This handles shuffles which take every Nth element where N is a power of
10792/// two. Example shuffle masks:
10793///
10794/// (even)
10795/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10796/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10797/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10798/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10799/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10800/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10801///
10802/// (odd)
10803/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10804/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10805///
10806/// Any of these lanes can of course be undef.
10807///
10808/// This routine only supports N <= 3.
10809/// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
10810/// for larger N.
10811///
10812/// \returns N above, or the number of times even/odd elements must be dropped
10813/// if there is such a number. Otherwise returns zero.
10814static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10815 bool IsSingleInput) {
10816 // The modulus for the shuffle vector entries is based on whether this is
10817 // a single input or not.
10818 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10819 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10820 "We should only be called with masks with a power-of-2 size!");
10821
10822 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10823 int Offset = MatchEven ? 0 : 1;
10824
10825 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10826 // and 2^3 simultaneously. This is because we may have ambiguity with
10827 // partially undef inputs.
10828 bool ViableForN[3] = {true, true, true};
10829
10830 for (int i = 0, e = Mask.size(); i < e; ++i) {
10831 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10832 // want.
10833 if (Mask[i] < 0)
10834 continue;
10835
10836 bool IsAnyViable = false;
10837 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10838 if (ViableForN[j]) {
10839 uint64_t N = j + 1;
10840
10841 // The shuffle mask must be equal to (i * 2^N) % M.
10842 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10843 IsAnyViable = true;
10844 else
10845 ViableForN[j] = false;
10846 }
10847 // Early exit if we exhaust the possible powers of two.
10848 if (!IsAnyViable)
10849 break;
10850 }
10851
10852 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10853 if (ViableForN[j])
10854 return j + 1;
10855
10856 // Return 0 as there is no viable power of two.
10857 return 0;
10858}
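// Illustrative example: the single-input 16-element mask
// <0,4,8,12,0,4,8,12,...> fails N == 1 (element 1 would need to be 2) but
// satisfies Mask[i] == (i << 2) & 15 for every element, so the routine
// returns N == 2: two rounds of dropping the odd elements give the desired
// compaction.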
10859
10860// X86 has dedicated pack instructions that can handle specific truncation
10861// operations: PACKSS and PACKUS.
10862// Checks for compaction shuffle masks if MaxStages > 1.
10863// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10864static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10865 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10866 const SelectionDAG &DAG,
10867 const X86Subtarget &Subtarget,
10868 unsigned MaxStages = 1) {
10869 unsigned NumElts = VT.getVectorNumElements();
10870 unsigned BitSize = VT.getScalarSizeInBits();
10871 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10872 "Illegal maximum compaction");
10873
10874 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10875 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10876 unsigned NumPackedBits = NumSrcBits - BitSize;
10877 N1 = peekThroughBitcasts(N1);
10878 N2 = peekThroughBitcasts(N2);
10879 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10880 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10881 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10882 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10883 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10884 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10885 return false;
10886 if (Subtarget.hasSSE41() || BitSize == 8) {
10887 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10888 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10889 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10890 V1 = N1;
10891 V2 = N2;
10892 SrcVT = PackVT;
10893 PackOpcode = X86ISD::PACKUS;
10894 return true;
10895 }
10896 }
10897 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10898 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10899 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10900 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10901 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10902 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10903 V1 = N1;
10904 V2 = N2;
10905 SrcVT = PackVT;
10906 PackOpcode = X86ISD::PACKSS;
10907 return true;
10908 }
10909 return false;
10910 };
10911
10912 // Attempt to match against wider and wider compaction patterns.
10913 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10914 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10915 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10916
10917 // Try binary shuffle.
10918 SmallVector<int, 32> BinaryMask;
10919 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10920 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10921 if (MatchPACK(V1, V2, PackVT))
10922 return true;
10923
10924 // Try unary shuffle.
10925 SmallVector<int, 32> UnaryMask;
10926 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10927 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10928 if (MatchPACK(V1, V1, PackVT))
10929 return true;
10930 }
10931
10932 return false;
10933}
10934
10935static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10936 SDValue V2, ArrayRef<int> Mask,
10937 const X86Subtarget &Subtarget,
10938 SelectionDAG &DAG) {
10939 MVT PackVT;
10940 unsigned PackOpcode;
10941 unsigned SizeBits = VT.getSizeInBits();
10942 unsigned EltBits = VT.getScalarSizeInBits();
10943 unsigned MaxStages = Log2_32(64 / EltBits);
10944 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10945 Subtarget, MaxStages))
10946 return SDValue();
10947
10948 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10949 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10950
10951 // Don't lower multi-stage packs on AVX512, truncation is better.
10952 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10953 return SDValue();
10954
10955 // Pack to the largest type possible:
10956 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10957 unsigned MaxPackBits = 16;
10958 if (CurrentEltBits > 16 &&
10959 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10960 MaxPackBits = 32;
10961
10962 // Repeatedly pack down to the target size.
10963 SDValue Res;
10964 for (unsigned i = 0; i != NumStages; ++i) {
10965 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10966 unsigned NumSrcElts = SizeBits / SrcEltBits;
10967 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10968 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10969 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10970 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10971 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10972 DAG.getBitcast(SrcVT, V2));
10973 V1 = V2 = Res;
10974 CurrentEltBits /= 2;
10975 }
10976 assert(Res && Res.getValueType() == VT &&
10977 "Failed to lower compaction shuffle");
10978 return Res;
10979}
10980
10981/// Try to emit a bitmask instruction for a shuffle.
10982///
10983/// This handles cases where we can model a blend exactly as a bitmask due to
10984/// one of the inputs being zeroable.
10985static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10986 SDValue V2, ArrayRef<int> Mask,
10987 const APInt &Zeroable,
10988 const X86Subtarget &Subtarget,
10989 SelectionDAG &DAG) {
10990 MVT MaskVT = VT;
10991 MVT EltVT = VT.getVectorElementType();
10992 SDValue Zero, AllOnes;
10993 // Use f64 if i64 isn't legal.
10994 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10995 EltVT = MVT::f64;
10996 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10997 }
10998
10999 MVT LogicVT = VT;
11000 if (EltVT.isFloatingPoint()) {
11001 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11002 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11003 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11004 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11005 } else {
11006 Zero = DAG.getConstant(0, DL, EltVT);
11007 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11008 }
11009
11010 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11011 SDValue V;
11012 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11013 if (Zeroable[i])
11014 continue;
11015 if (Mask[i] % Size != i)
11016 return SDValue(); // Not a blend.
11017 if (!V)
11018 V = Mask[i] < Size ? V1 : V2;
11019 else if (V != (Mask[i] < Size ? V1 : V2))
11020 return SDValue(); // Can only let one input through the mask.
11021
11022 VMaskOps[i] = AllOnes;
11023 }
11024 if (!V)
11025 return SDValue(); // No non-zeroable elements!
11026
11027 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11028 VMask = DAG.getBitcast(LogicVT, VMask);
11029 V = DAG.getBitcast(LogicVT, V);
11030 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11031 return DAG.getBitcast(VT, And);
11032}
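// Illustrative example: a v4i32 shuffle that keeps elements 0 and 3 of V1
// and zeroes elements 1 and 2 becomes an AND of V1 with the build vector
// <-1,0,0,-1>.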
11033
11034/// Try to emit a blend instruction for a shuffle using bit math.
11035///
11036/// This is used as a fallback approach when first class blend instructions are
11037/// unavailable. Currently it is only suitable for integer vectors, but could
11038/// be generalized for floating point vectors if desirable.
11039static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11040 SDValue V2, ArrayRef<int> Mask,
11041 SelectionDAG &DAG) {
11042 assert(VT.isInteger() && "Only supports integer vector types!");
11043 MVT EltVT = VT.getVectorElementType();
11044 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11045 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11046 SmallVector<SDValue, 16> MaskOps;
11047 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11048 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11049 return SDValue(); // Shuffled input!
11050 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11051 }
11052
11053 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11054 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11055}
11056
11057static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11058 SDValue PreservedSrc,
11059 const X86Subtarget &Subtarget,
11060 SelectionDAG &DAG);
11061
11062static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11063 MutableArrayRef<int> Mask,
11064 const APInt &Zeroable, bool &ForceV1Zero,
11065 bool &ForceV2Zero, uint64_t &BlendMask) {
11066 bool V1IsZeroOrUndef =
11067 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11068 bool V2IsZeroOrUndef =
11069 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11070
11071 BlendMask = 0;
11072 ForceV1Zero = false, ForceV2Zero = false;
11073 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11074
11075 int NumElts = Mask.size();
11076 int NumLanes = VT.getSizeInBits() / 128;
11077 int NumEltsPerLane = NumElts / NumLanes;
11078 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11079
11080 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11081 // then ensure the blend mask part for that lane just references that input.
11082 bool ForceWholeLaneMasks =
11083 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11084
11085 // Attempt to generate the binary blend mask. If an input is zero then
11086 // we can use any lane.
11087 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11088 // Keep track of the inputs used per lane.
11089 bool LaneV1InUse = false;
11090 bool LaneV2InUse = false;
11091 uint64_t LaneBlendMask = 0;
11092 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11093 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11094 int M = Mask[Elt];
11095 if (M == SM_SentinelUndef)
11096 continue;
11097 if (M == Elt || (0 <= M && M < NumElts &&
11098 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11099 Mask[Elt] = Elt;
11100 LaneV1InUse = true;
11101 continue;
11102 }
11103 if (M == (Elt + NumElts) ||
11104 (NumElts <= M &&
11105 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11106 LaneBlendMask |= 1ull << LaneElt;
11107 Mask[Elt] = Elt + NumElts;
11108 LaneV2InUse = true;
11109 continue;
11110 }
11111 if (Zeroable[Elt]) {
11112 if (V1IsZeroOrUndef) {
11113 ForceV1Zero = true;
11114 Mask[Elt] = Elt;
11115 LaneV1InUse = true;
11116 continue;
11117 }
11118 if (V2IsZeroOrUndef) {
11119 ForceV2Zero = true;
11120 LaneBlendMask |= 1ull << LaneElt;
11121 Mask[Elt] = Elt + NumElts;
11122 LaneV2InUse = true;
11123 continue;
11124 }
11125 }
11126 return false;
11127 }
11128
11129 // If we only used V2 then splat the lane blend mask to avoid any demanded
11130 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11131 // blend mask bit).
11132 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11133 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11134
11135 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11136 }
11137 return true;
11138}
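// Illustrative example: the v4i32 mask <0,5,2,7> is a blend that takes
// elements 1 and 3 from V2, so BlendMask becomes 0b1010.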
11139
11140/// Try to emit a blend instruction for a shuffle.
11141///
11142/// This doesn't do any checks for the availability of instructions for blending
11143/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11144/// be matched in the backend with the type given. What it does check for is
11145/// that the shuffle mask is a blend, or convertible into a blend with zero.
11146static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11147 SDValue V2, ArrayRef<int> Original,
11148 const APInt &Zeroable,
11149 const X86Subtarget &Subtarget,
11150 SelectionDAG &DAG) {
11151 uint64_t BlendMask = 0;
11152 bool ForceV1Zero = false, ForceV2Zero = false;
11153 SmallVector<int, 64> Mask(Original);
11154 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11155 BlendMask))
11156 return SDValue();
11157
11158 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11159 if (ForceV1Zero)
11160 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11161 if (ForceV2Zero)
11162 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11163
11164 unsigned NumElts = VT.getVectorNumElements();
11165
11166 switch (VT.SimpleTy) {
11167 case MVT::v4i64:
11168 case MVT::v8i32:
11169 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11170 [[fallthrough]];
11171 case MVT::v4f64:
11172 case MVT::v8f32:
11173 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11174 [[fallthrough]];
11175 case MVT::v2f64:
11176 case MVT::v2i64:
11177 case MVT::v4f32:
11178 case MVT::v4i32:
11179 case MVT::v8i16:
11180 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11181 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11182 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11183 case MVT::v16i16: {
11184 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11185 SmallVector<int, 8> RepeatedMask;
11186 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11187 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11188 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11189 BlendMask = 0;
11190 for (int i = 0; i < 8; ++i)
11191 if (RepeatedMask[i] >= 8)
11192 BlendMask |= 1ull << i;
11193 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11194 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11195 }
11196 // Use PBLENDW for lower/upper lanes and then blend lanes.
11197 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11198 // merge to VSELECT where useful.
11199 uint64_t LoMask = BlendMask & 0xFF;
11200 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11201 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11202 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11203 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11204 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11205 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11206 return DAG.getVectorShuffle(
11207 MVT::v16i16, DL, Lo, Hi,
11208 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11209 }
11210 [[fallthrough]];
11211 }
11212 case MVT::v32i8:
11213 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11214 [[fallthrough]];
11215 case MVT::v16i8: {
11216 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11217
11218 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11219 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11220 Subtarget, DAG))
11221 return Masked;
11222
11223 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11224 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11225 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11226 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11227 }
11228
11229 // If we have VPTERNLOG, we can use that as a bit blend.
11230 if (Subtarget.hasVLX())
11231 if (SDValue BitBlend =
11232 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11233 return BitBlend;
11234
11235 // Scale the blend by the number of bytes per element.
11236 int Scale = VT.getScalarSizeInBits() / 8;
11237
11238 // This form of blend is always done on bytes. Compute the byte vector
11239 // type.
11240 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11241
11242 // x86 allows load folding with blendvb from the 2nd source operand. But
11243 // we are still using LLVM select here (see comment below), so that's V1.
11244 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11245 // allow that load-folding possibility.
11246 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11247 ShuffleVectorSDNode::commuteMask(Mask);
11248 std::swap(V1, V2);
11249 }
11250
11251 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11252 // mix of LLVM's code generator and the x86 backend. We tell the code
11253 // generator that boolean values in the elements of an x86 vector register
11254 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11255 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11256 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11257 // of the element (the remaining are ignored) and 0 in that high bit would
11258 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11259 // the LLVM model for boolean values in vector elements gets the relevant
11260 // bit set, it is set backwards and over constrained relative to x86's
11261 // actual model.
11262 SmallVector<SDValue, 32> VSELECTMask;
11263 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11264 for (int j = 0; j < Scale; ++j)
11265 VSELECTMask.push_back(
11266 Mask[i] < 0
11267 ? DAG.getUNDEF(MVT::i8)
11268 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11269
11270 V1 = DAG.getBitcast(BlendVT, V1);
11271 V2 = DAG.getBitcast(BlendVT, V2);
11272 return DAG.getBitcast(
11273 VT,
11274 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11275 V1, V2));
11276 }
11277 case MVT::v16f32:
11278 case MVT::v8f64:
11279 case MVT::v8i64:
11280 case MVT::v16i32:
11281 case MVT::v32i16:
11282 case MVT::v64i8: {
11283 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11284 bool OptForSize = DAG.shouldOptForSize();
11285 if (!OptForSize) {
11286 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11287 Subtarget, DAG))
11288 return Masked;
11289 }
11290
11291 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11292 // masked move.
11293 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11294 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11295 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11296 }
11297 default:
11298 llvm_unreachable("Not a supported integer vector type!");
11299 }
11300}
11301
11302/// Try to lower as a blend of elements from two inputs followed by
11303/// a single-input permutation.
11304///
11305/// This matches the pattern where we can blend elements from two inputs and
11306/// then reduce the shuffle to a single-input permutation.
11307static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11308 SDValue V1, SDValue V2,
11309 ArrayRef<int> Mask,
11310 SelectionDAG &DAG,
11311 bool ImmBlends = false) {
11312 // We build up the blend mask while checking whether a blend is a viable way
11313 // to reduce the shuffle.
11314 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11315 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11316
11317 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11318 if (Mask[i] < 0)
11319 continue;
11320
11321 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11322
11323 if (BlendMask[Mask[i] % Size] < 0)
11324 BlendMask[Mask[i] % Size] = Mask[i];
11325 else if (BlendMask[Mask[i] % Size] != Mask[i])
11326 return SDValue(); // Can't blend in the needed input!
11327
11328 PermuteMask[i] = Mask[i] % Size;
11329 }
11330
11331 // If only immediate blends, then bail if the blend mask can't be widened to
11332 // i16.
11333 unsigned EltSize = VT.getScalarSizeInBits();
11334 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11335 return SDValue();
11336
11337 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11338 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11339}
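// Illustrative example: the v4i32 mask <1,4,3,6> is first blended as
// <4,1,6,3> (taking elements 0 and 2 from V2) and the result is then
// permuted with <1,0,3,2> to restore the requested order.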
11340
11341/// Try to lower as an unpack of elements from two inputs followed by
11342/// a single-input permutation.
11343///
11344/// This matches the pattern where we can unpack elements from two inputs and
11345/// then reduce the shuffle to a single-input (wider) permutation.
11346static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11347 SDValue V1, SDValue V2,
11348 ArrayRef<int> Mask,
11349 SelectionDAG &DAG) {
11350 int NumElts = Mask.size();
11351 int NumLanes = VT.getSizeInBits() / 128;
11352 int NumLaneElts = NumElts / NumLanes;
11353 int NumHalfLaneElts = NumLaneElts / 2;
11354
11355 bool MatchLo = true, MatchHi = true;
11356 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11357
11358 // Determine UNPCKL/UNPCKH type and operand order.
11359 for (int Elt = 0; Elt != NumElts; ++Elt) {
11360 int M = Mask[Elt];
11361 if (M < 0)
11362 continue;
11363
11364 // Normalize the mask value depending on whether it's V1 or V2.
11365 int NormM = M;
11366 SDValue &Op = Ops[Elt & 1];
11367 if (M < NumElts && (Op.isUndef() || Op == V1))
11368 Op = V1;
11369 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11370 Op = V2;
11371 NormM -= NumElts;
11372 } else
11373 return SDValue();
11374
11375 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11376 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11377 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11378 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11379 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11380 if (MatchLoAnyLane || MatchHiAnyLane) {
11381 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11382 "Failed to match UNPCKLO/UNPCKHI");
11383 break;
11384 }
11385 }
11386 MatchLo &= MatchLoAnyLane;
11387 MatchHi &= MatchHiAnyLane;
11388 if (!MatchLo && !MatchHi)
11389 return SDValue();
11390 }
11391 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11392
11393 // Element indices have changed after unpacking. Calculate permute mask
11394 // so that they will be put back to the position as dictated by the
11395 // original shuffle mask indices.
11396 SmallVector<int, 32> PermuteMask(NumElts, -1);
11397 for (int Elt = 0; Elt != NumElts; ++Elt) {
11398 int M = Mask[Elt];
11399 if (M < 0)
11400 continue;
11401 int NormM = M;
11402 if (NumElts <= M)
11403 NormM -= NumElts;
11404 bool IsFirstOp = M < NumElts;
11405 int BaseMaskElt =
11406 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11407 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11408 PermuteMask[Elt] = BaseMaskElt;
11409 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11410 PermuteMask[Elt] = BaseMaskElt + 1;
11411 assert(PermuteMask[Elt] != -1 &&
11412 "Input mask element is defined but failed to assign permute mask");
11413 }
11414
11415 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11416 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11417 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11418}
11419
11420/// Try to lower a shuffle as a permute of the inputs followed by an
11421/// UNPCK instruction.
11422///
11423/// This specifically targets cases where we end up with alternating between
11424/// the two inputs, and so can permute them into something that feeds a single
11425/// UNPCK instruction. Note that this routine only targets integer vectors
11426/// because for floating point vectors we have a generalized SHUFPS lowering
11427/// strategy that handles everything that doesn't *exactly* match an unpack,
11428/// making this clever lowering unnecessary.
11429 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11430 SDValue V1, SDValue V2,
11431 ArrayRef<int> Mask,
11432 const X86Subtarget &Subtarget,
11433 SelectionDAG &DAG) {
11434 int Size = Mask.size();
11435 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11436
11437 // This routine only supports 128-bit integer dual input vectors.
11438 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11439 return SDValue();
11440
11441 int NumLoInputs =
11442 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11443 int NumHiInputs =
11444 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11445
11446 bool UnpackLo = NumLoInputs >= NumHiInputs;
11447
11448 auto TryUnpack = [&](int ScalarSize, int Scale) {
11449 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11450 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11451
11452 for (int i = 0; i < Size; ++i) {
11453 if (Mask[i] < 0)
11454 continue;
11455
11456 // Each element of the unpack contains Scale elements from this mask.
11457 int UnpackIdx = i / Scale;
11458
11459 // We only handle the case where V1 feeds the first slots of the unpack.
11460 // We rely on canonicalization to ensure this is the case.
11461 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11462 return SDValue();
11463
11464 // Setup the mask for this input. The indexing is tricky as we have to
11465 // handle the unpack stride.
11466 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11467 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11468 Mask[i] % Size;
11469 }
11470
11471 // If we will have to shuffle both inputs to use the unpack, check whether
11472 // we can just unpack first and shuffle the result. If so, skip this unpack.
11473 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11474 !isNoopShuffleMask(V2Mask))
11475 return SDValue();
11476
11477 // Shuffle the inputs into place.
11478 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11479 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11480
11481 // Cast the inputs to the type we will use to unpack them.
11482 MVT UnpackVT =
11483 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11484 V1 = DAG.getBitcast(UnpackVT, V1);
11485 V2 = DAG.getBitcast(UnpackVT, V2);
11486
11487 // Unpack the inputs and cast the result back to the desired type.
11488 return DAG.getBitcast(
11489 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11490 UnpackVT, V1, V2));
11491 };
11492
11493 // We try each unpack from the largest to the smallest to try and find one
11494 // that fits this mask.
11495 int OrigScalarSize = VT.getScalarSizeInBits();
11496 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11497 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11498 return Unpack;
11499
11500 // If we're shuffling with a zero vector then we're better off not doing
11501 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11502 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11503 ISD::isBuildVectorAllZeros(V2.getNode()))
11504 return SDValue();
11505
11506 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11507 // initial unpack.
11508 if (NumLoInputs == 0 || NumHiInputs == 0) {
11509 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11510 "We have to have *some* inputs!");
11511 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11512
11513 // FIXME: We could consider the total complexity of the permute of each
11514 // possible unpacking. Or at the least we should consider how many
11515 // half-crossings are created.
11516 // FIXME: We could consider commuting the unpacks.
11517
11518 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11519 for (int i = 0; i < Size; ++i) {
11520 if (Mask[i] < 0)
11521 continue;
11522
11523 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11524
11525 PermMask[i] =
11526 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11527 }
11528 return DAG.getVectorShuffle(
11529 VT, DL,
11530 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11531 V1, V2),
11532 DAG.getUNDEF(VT), PermMask);
11533 }
11534
11535 return SDValue();
11536}
11537
11538/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11539/// permuting the elements of the result in place.
11540 static SDValue lowerShuffleAsByteRotateAndPermute(
11541 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11542 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11543 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11544 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11545 (VT.is512BitVector() && !Subtarget.hasBWI()))
11546 return SDValue();
11547
11548 // We don't currently support lane crossing permutes.
11549 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11550 return SDValue();
11551
11552 int Scale = VT.getScalarSizeInBits() / 8;
11553 int NumLanes = VT.getSizeInBits() / 128;
11554 int NumElts = VT.getVectorNumElements();
11555 int NumEltsPerLane = NumElts / NumLanes;
11556
11557 // Determine range of mask elts.
11558 bool Blend1 = true;
11559 bool Blend2 = true;
11560 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11561 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11562 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11563 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11564 int M = Mask[Lane + Elt];
11565 if (M < 0)
11566 continue;
11567 if (M < NumElts) {
11568 Blend1 &= (M == (Lane + Elt));
11569 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11570 M = M % NumEltsPerLane;
11571 Range1.first = std::min(Range1.first, M);
11572 Range1.second = std::max(Range1.second, M);
11573 } else {
11574 M -= NumElts;
11575 Blend2 &= (M == (Lane + Elt));
11576 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11577 M = M % NumEltsPerLane;
11578 Range2.first = std::min(Range2.first, M);
11579 Range2.second = std::max(Range2.second, M);
11580 }
11581 }
11582 }
11583
11584 // Bail if we don't need both elements.
11585 // TODO - it might be worth doing this for unary shuffles if the permute
11586 // can be widened.
11587 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11588 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11589 return SDValue();
11590
11591 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11592 return SDValue();
11593
11594 // Rotate the 2 ops so we can access both ranges, then permute the result.
11595 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11596 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11597 SDValue Rotate = DAG.getBitcast(
11598 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11599 DAG.getBitcast(ByteVT, Lo),
11600 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11601 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11602 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11603 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11604 int M = Mask[Lane + Elt];
11605 if (M < 0)
11606 continue;
11607 if (M < NumElts)
11608 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11609 else
11610 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11611 }
11612 }
11613 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11614 };
11615
11616 // Check if the ranges are small enough to rotate from either direction.
11617 if (Range2.second < Range1.first)
11618 return RotateAndPermute(V1, V2, Range1.first, 0);
11619 if (Range1.second < Range2.first)
11620 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11621 return SDValue();
11622}
11623
11624 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11625 return isUndefOrEqual(Mask, 0);
11626}
11627
11628 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11629 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11630}
11631
11632/// Check if the Mask consists of the same element repeated multiple times.
11633 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11634 size_t NumUndefs = 0;
11635 std::optional<int> UniqueElt;
11636 for (int Elt : Mask) {
11637 if (Elt == SM_SentinelUndef) {
11638 NumUndefs++;
11639 continue;
11640 }
11641 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11642 return false;
11643 UniqueElt = Elt;
11644 }
11645 // Make sure the element is repeated enough times by checking the number of
11646 // undefs is small.
11647 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11648}
11649
11650/// Generic routine to decompose a shuffle and blend into independent
11651/// blends and permutes.
11652///
11653/// This matches the extremely common pattern for handling combined
11654/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11655/// operations. It will try to pick the best arrangement of shuffles and
11656/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11657 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11658 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11659 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11660 int NumElts = Mask.size();
11661 int NumLanes = VT.getSizeInBits() / 128;
11662 int NumEltsPerLane = NumElts / NumLanes;
11663
11664 // Shuffle the input elements into the desired positions in V1 and V2 and
11665 // unpack/blend them together.
11666 bool IsAlternating = true;
11667 bool V1Zero = true, V2Zero = true;
11668 SmallVector<int, 32> V1Mask(NumElts, -1);
11669 SmallVector<int, 32> V2Mask(NumElts, -1);
11670 SmallVector<int, 32> FinalMask(NumElts, -1);
11671 for (int i = 0; i < NumElts; ++i) {
11672 int M = Mask[i];
11673 if (M >= 0 && M < NumElts) {
11674 V1Mask[i] = M;
11675 FinalMask[i] = i;
11676 V1Zero &= Zeroable[i];
11677 IsAlternating &= (i & 1) == 0;
11678 } else if (M >= NumElts) {
11679 V2Mask[i] = M - NumElts;
11680 FinalMask[i] = i + NumElts;
11681 V2Zero &= Zeroable[i];
11682 IsAlternating &= (i & 1) == 1;
11683 }
11684 }
11685
11686 // If we effectively demand only the 0'th element of \p Input (not merely
11687 // in the 0'th output position), then broadcast said input,
11688 // and change \p InputMask to be a no-op (identity) mask.
11689 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11690 &DAG](SDValue &Input,
11691 MutableArrayRef<int> InputMask) {
11692 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11693 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11694 !X86::mayFoldLoad(Input, Subtarget)))
11695 return;
11696 if (isNoopShuffleMask(InputMask))
11697 return;
11698 assert(isBroadcastShuffleMask(InputMask) &&
11699 "Expected to demand only the 0'th element.");
11700 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11701 for (auto I : enumerate(InputMask)) {
11702 int &InputMaskElt = I.value();
11703 if (InputMaskElt >= 0)
11704 InputMaskElt = I.index();
11705 }
11706 };
11707
11708 // Currently, we may need to produce one shuffle per input, and blend results.
11709 // It is possible that the shuffle for one of the inputs is already a no-op.
11710 // See if we can simplify non-no-op shuffles into broadcasts,
11711 // which we consider to be strictly better than an arbitrary shuffle.
11712 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11713 isNoopOrBroadcastShuffleMask(V2Mask)) {
11714 canonicalizeBroadcastableInput(V1, V1Mask);
11715 canonicalizeBroadcastableInput(V2, V2Mask);
11716 }
11717
11718 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11719 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11720 // the shuffle may be able to fold with a load or other benefit. However, when
11721 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11722 // pre-shuffle first is a better strategy.
11723 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11724 // Only prefer immediate blends to unpack/rotate.
11725 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11726 DAG, true))
11727 return BlendPerm;
11728 // If either input vector provides only a single element which is repeated
11729 // multiple times, unpacking from both input vectors would generate worse
11730 // code. e.g. for
11731 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11732 // it is better to process t4 first to create a vector of t4[0], then unpack
11733 // that vector with t2.
11734 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11735 !isSingleElementRepeatedMask(V2Mask))
11736 if (SDValue UnpackPerm =
11737 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11738 return UnpackPerm;
11739 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11740 DL, VT, V1, V2, Mask, Subtarget, DAG))
11741 return RotatePerm;
11742 // Unpack/rotate failed - try again with variable blends.
11743 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11744 DAG))
11745 return BlendPerm;
11746 if (VT.getScalarSizeInBits() >= 32)
11747 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11748 DL, VT, V1, V2, Mask, Subtarget, DAG))
11749 return PermUnpack;
11750 }
11751
11752 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11753 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11754 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11755 // than half the elements coming from each source.
11756 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11757 V1Mask.assign(NumElts, -1);
11758 V2Mask.assign(NumElts, -1);
11759 FinalMask.assign(NumElts, -1);
11760 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11761 for (int j = 0; j != NumEltsPerLane; ++j) {
11762 int M = Mask[i + j];
11763 if (M >= 0 && M < NumElts) {
11764 V1Mask[i + (j / 2)] = M;
11765 FinalMask[i + j] = i + (j / 2);
11766 } else if (M >= NumElts) {
11767 V2Mask[i + (j / 2)] = M - NumElts;
11768 FinalMask[i + j] = i + (j / 2) + NumElts;
11769 }
11770 }
11771 }
11772
11773 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11774 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11775 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11776}
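// Illustrative standalone sketch (not part of the LLVM source): the
// alternating vXi8/vXi16 rewrite above compacts the elements each source
// contributes into the low half of its lane and rebuilds the result as an
// interleave such as <0,16,1,17,...>, i.e. a single UNPCKL of the two
// pre-shuffled inputs. UNPCKL itself simply interleaves the low halves:
static void unpackLowSketch(const unsigned char A[8], const unsigned char B[8],
                            unsigned char Out[16]) {
  for (int i = 0; i < 8; ++i) {
    Out[2 * i + 0] = A[i]; // Even result bytes come from the first input.
    Out[2 * i + 1] = B[i]; // Odd result bytes come from the second input.
  }
}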
11777
11778static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11779 const X86Subtarget &Subtarget,
11780 ArrayRef<int> Mask) {
11781 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11782 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11783
11784 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11785 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11786 int MaxSubElts = 64 / EltSizeInBits;
11787 unsigned RotateAmt, NumSubElts;
11788 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11789 MaxSubElts, NumSubElts, RotateAmt))
11790 return -1;
11791 unsigned NumElts = Mask.size();
11792 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11793 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11794 return RotateAmt;
11795}
11796
11797/// Lower shuffle using X86ISD::VROTLI rotations.
11798 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11799 ArrayRef<int> Mask,
11800 const X86Subtarget &Subtarget,
11801 SelectionDAG &DAG) {
11802 // Only XOP + AVX512 targets have bit rotation instructions.
11803 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11804 bool IsLegal =
11805 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11806 if (!IsLegal && Subtarget.hasSSE3())
11807 return SDValue();
11808
11809 MVT RotateVT;
11810 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11811 Subtarget, Mask);
11812 if (RotateAmt < 0)
11813 return SDValue();
11814
11815 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11816 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11817 // widen to vXi16 or more then the existing lowering will be better.
11818 if (!IsLegal) {
11819 if ((RotateAmt % 16) == 0)
11820 return SDValue();
11821 // TODO: Use getTargetVShiftByConstNode.
11822 unsigned ShlAmt = RotateAmt;
11823 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11824 V1 = DAG.getBitcast(RotateVT, V1);
11825 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11826 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11827 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11828 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11829 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11830 return DAG.getBitcast(VT, Rot);
11831 }
11832
11833 SDValue Rot =
11834 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11835 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11836 return DAG.getBitcast(VT, Rot);
11837}
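// Illustrative standalone sketch (not part of the LLVM source): the v16i8
// mask <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14> rotates every 32-bit group
// left by one byte, so it is matched as a v4i32 bit-rotate by 8 and emitted
// either as X86ISD::VROTLI or, on targets without native rotates, as the
// OR(VSHLI,VSRLI) expansion above. Per 32-bit element that is simply:
#include <cstdint>
static uint32_t rotl32By8Sketch(uint32_t V) {
  return (V << 8) | (V >> 24); // ShlAmt = 8, SrlAmt = 32 - 8 = 24.
}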
11838
11839/// Try to match a vector shuffle as an element rotation.
11840///
11841 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11842 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11843 ArrayRef<int> Mask) {
11844 int NumElts = Mask.size();
11845
11846 // We need to detect various ways of spelling a rotation:
11847 // [11, 12, 13, 14, 15, 0, 1, 2]
11848 // [-1, 12, 13, 14, -1, -1, 1, -1]
11849 // [-1, -1, -1, -1, -1, -1, 1, 2]
11850 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11851 // [-1, 4, 5, 6, -1, -1, 9, -1]
11852 // [-1, 4, 5, 6, -1, -1, -1, -1]
11853 int Rotation = 0;
11854 SDValue Lo, Hi;
11855 for (int i = 0; i < NumElts; ++i) {
11856 int M = Mask[i];
11857 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11858 "Unexpected mask index.");
11859 if (M < 0)
11860 continue;
11861
11862 // Determine where a rotated vector would have started.
11863 int StartIdx = i - (M % NumElts);
11864 if (StartIdx == 0)
11865 // The identity rotation isn't interesting, stop.
11866 return -1;
11867
11868 // If we found the tail of a vector the rotation must be the missing
11869 // front. If we found the head of a vector, it must be how much of the
11870 // head.
11871 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11872
11873 if (Rotation == 0)
11874 Rotation = CandidateRotation;
11875 else if (Rotation != CandidateRotation)
11876 // The rotations don't match, so we can't match this mask.
11877 return -1;
11878
11879 // Compute which value this mask is pointing at.
11880 SDValue MaskV = M < NumElts ? V1 : V2;
11881
11882 // Compute which of the two target values this index should be assigned
11883 // to. This reflects whether the high elements are remaining or the low
11884 // elements are remaining.
11885 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11886
11887 // Either set up this value if we've not encountered it before, or check
11888 // that it remains consistent.
11889 if (!TargetV)
11890 TargetV = MaskV;
11891 else if (TargetV != MaskV)
11892 // This may be a rotation, but it pulls from the inputs in some
11893 // unsupported interleaving.
11894 return -1;
11895 }
11896
11897 // Check that we successfully analyzed the mask, and normalize the results.
11898 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11899 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11900 if (!Lo)
11901 Lo = Hi;
11902 else if (!Hi)
11903 Hi = Lo;
11904
11905 V1 = Lo;
11906 V2 = Hi;
11907
11908 return Rotation;
11909}
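// Illustrative standalone sketch (not part of the LLVM source): the same
// rotation detection on a plain index mask, omitting the Lo/Hi operand
// tracking. For the v8i16 mask <11,12,13,14,15,0,1,2> it returns 3, which
// matchShuffleAsByteRotate below scales to a PALIGNR byte immediate of 6
// (Rotation * 16 / NumElts).
static int matchElementRotateSketch(const int *Mask, int NumElts) {
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);
    if (StartIdx == 0)
      return -1; // Identity rotation - not interesting.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // Inconsistent rotation amounts.
  }
  return Rotation;
}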
11910
11911/// Try to lower a vector shuffle as a byte rotation.
11912///
11913/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11914/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11915/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11916 /// try to generically lower a vector shuffle through such a pattern. It
11917/// does not check for the profitability of lowering either as PALIGNR or
11918/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11919/// This matches shuffle vectors that look like:
11920///
11921/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11922///
11923/// Essentially it concatenates V1 and V2, shifts right by some number of
11924/// elements, and takes the low elements as the result. Note that while this is
11925/// specified as a *right shift* because x86 is little-endian, it is a *left
11926/// rotate* of the vector lanes.
11927 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11928 ArrayRef<int> Mask) {
11929 // Don't accept any shuffles with zero elements.
11930 if (isAnyZero(Mask))
11931 return -1;
11932
11933 // PALIGNR works on 128-bit lanes.
11934 SmallVector<int, 16> RepeatedMask;
11935 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11936 return -1;
11937
11938 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11939 if (Rotation <= 0)
11940 return -1;
11941
11942 // PALIGNR rotates bytes, so we need to scale the
11943 // rotation based on how many bytes are in the vector lane.
11944 int NumElts = RepeatedMask.size();
11945 int Scale = 16 / NumElts;
11946 return Rotation * Scale;
11947}
11948
11949 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11950 SDValue V2, ArrayRef<int> Mask,
11951 const X86Subtarget &Subtarget,
11952 SelectionDAG &DAG) {
11953 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11954
11955 SDValue Lo = V1, Hi = V2;
11956 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11957 if (ByteRotation <= 0)
11958 return SDValue();
11959
11960 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11961 // PSLLDQ/PSRLDQ.
11962 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11963 Lo = DAG.getBitcast(ByteVT, Lo);
11964 Hi = DAG.getBitcast(ByteVT, Hi);
11965
11966 // SSSE3 targets can use the palignr instruction.
11967 if (Subtarget.hasSSSE3()) {
11968 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11969 "512-bit PALIGNR requires BWI instructions");
11970 return DAG.getBitcast(
11971 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11972 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11973 }
11974
11975 assert(VT.is128BitVector() &&
11976 "Rotate-based lowering only supports 128-bit lowering!");
11977 assert(Mask.size() <= 16 &&
11978 "Can shuffle at most 16 bytes in a 128-bit vector!");
11979 assert(ByteVT == MVT::v16i8 &&
11980 "SSE2 rotate lowering only needed for v16i8!");
11981
11982 // Default SSE2 implementation
11983 int LoByteShift = 16 - ByteRotation;
11984 int HiByteShift = ByteRotation;
11985
11986 SDValue LoShift =
11987 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11988 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11989 SDValue HiShift =
11990 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11991 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11992 return DAG.getBitcast(VT,
11993 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11994}
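// Illustrative standalone sketch (not part of the LLVM source): the SSE2
// fallback above in scalar form. Extracting bytes R..R+15 of the 32-byte
// concatenation (Hi in bytes 0-15, Lo in bytes 16-31) is the same as
// OR(VSHLDQ(Lo, 16 - R), VSRLDQ(Hi, R)), byte by byte:
static void byteRotateSketch(const unsigned char Lo[16],
                             const unsigned char Hi[16], int R,
                             unsigned char Out[16]) {
  for (int i = 0; i < 16; ++i) {
    // VSRLDQ(Hi, R): bytes R..15 of Hi move down to 0..15-R, rest become 0.
    unsigned char HiPart = (i + R < 16) ? Hi[i + R] : 0;
    // VSHLDQ(Lo, 16-R): byte j of Lo moves up to j + (16 - R), rest become 0.
    unsigned char LoPart = (i >= 16 - R) ? Lo[i - (16 - R)] : 0;
    Out[i] = HiPart | LoPart;
  }
}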
11995
11996/// Try to lower a vector shuffle as a dword/qword rotation.
11997///
11998 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11999 /// rotation of the concatenation of two vectors; this routine will
12000 /// try to generically lower a vector shuffle through such a pattern.
12001///
12002/// Essentially it concatenates V1 and V2, shifts right by some number of
12003/// elements, and takes the low elements as the result. Note that while this is
12004/// specified as a *right shift* because x86 is little-endian, it is a *left
12005/// rotate* of the vector lanes.
12006 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12007 SDValue V2, ArrayRef<int> Mask,
12008 const APInt &Zeroable,
12009 const X86Subtarget &Subtarget,
12010 SelectionDAG &DAG) {
12011 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12012 "Only 32-bit and 64-bit elements are supported!");
12013
12014 // 128/256-bit vectors are only supported with VLX.
12015 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12016 && "VLX required for 128/256-bit vectors");
12017
12018 SDValue Lo = V1, Hi = V2;
12019 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12020 if (0 < Rotation)
12021 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12022 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12023
12024 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12025 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12026 // TODO: We can probably make this more aggressive and use shift-pairs like
12027 // lowerShuffleAsByteShiftMask.
12028 unsigned NumElts = Mask.size();
12029 unsigned ZeroLo = Zeroable.countr_one();
12030 unsigned ZeroHi = Zeroable.countl_one();
12031 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12032 if (!ZeroLo && !ZeroHi)
12033 return SDValue();
12034
12035 if (ZeroLo) {
12036 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12037 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12038 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12039 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12040 getZeroVector(VT, Subtarget, DAG, DL),
12041 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12042 }
12043
12044 if (ZeroHi) {
12045 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12046 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12047 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12048 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12049 getZeroVector(VT, Subtarget, DAG, DL), Src,
12050 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12051 }
12052
12053 return SDValue();
12054}
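// Illustrative standalone sketch (not part of the LLVM source): a v8i32
// two-input mask such as <3,4,5,6,7,8,9,10> selects 8 consecutive elements,
// starting at index 3, of the 16-element concatenation in which the first
// source provides indices 0-7 and the second indices 8-15. The rotation
// matched above (3) becomes the VALIGND immediate.
static void valignSketch(const unsigned Concat[16], unsigned Imm,
                         unsigned Out[8]) {
  for (int i = 0; i < 8; ++i)
    Out[i] = Concat[Imm + i]; // Keep NumElts consecutive elements at Imm.
}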
12055
12056/// Try to lower a vector shuffle as a byte shift sequence.
12057 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12058 SDValue V2, ArrayRef<int> Mask,
12059 const APInt &Zeroable,
12060 const X86Subtarget &Subtarget,
12061 SelectionDAG &DAG) {
12062 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12063 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12064
12065 // We need a shuffle that has zeros at one/both ends and a sequential
12066 // shuffle from one source within.
12067 unsigned ZeroLo = Zeroable.countr_one();
12068 unsigned ZeroHi = Zeroable.countl_one();
12069 if (!ZeroLo && !ZeroHi)
12070 return SDValue();
12071
12072 unsigned NumElts = Mask.size();
12073 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12074 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12075 return SDValue();
12076
12077 unsigned Scale = VT.getScalarSizeInBits() / 8;
12078 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12079 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12080 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12081 return SDValue();
12082
12083 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12084 Res = DAG.getBitcast(MVT::v16i8, Res);
12085
12086 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12087 // inner sequential set of elements, possibly offset:
12088 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12089 // 01234567 --> 4567zzzz --> zzzzz456
12090 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12091 if (ZeroLo == 0) {
12092 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12093 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12094 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12095 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12096 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12097 } else if (ZeroHi == 0) {
12098 unsigned Shift = Mask[ZeroLo] % NumElts;
12099 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12100 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12101 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12102 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12103 } else if (!Subtarget.hasSSSE3()) {
12104 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12105 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12106 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12107 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12108 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12109 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12110 Shift += Mask[ZeroLo] % NumElts;
12111 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12112 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12113 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12114 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12115 } else
12116 return SDValue();
12117
12118 return DAG.getBitcast(VT, Res);
12119}
12120
12121/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12122///
12123/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12124/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12125/// matches elements from one of the input vectors shuffled to the left or
12126/// right with zeroable elements 'shifted in'. It handles both the strictly
12127/// bit-wise element shifts and the byte shift across an entire 128-bit double
12128/// quad word lane.
12129///
12130/// PSHL : (little-endian) left bit shift.
12131/// [ zz, 0, zz, 2 ]
12132/// [ -1, 4, zz, -1 ]
12133/// PSRL : (little-endian) right bit shift.
12134/// [ 1, zz, 3, zz]
12135/// [ -1, -1, 7, zz]
12136/// PSLLDQ : (little-endian) left byte shift
12137/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12138/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12139/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12140/// PSRLDQ : (little-endian) right byte shift
12141/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12142/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12143/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12144static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12145 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12146 int MaskOffset, const APInt &Zeroable,
12147 const X86Subtarget &Subtarget) {
12148 int Size = Mask.size();
12149 unsigned SizeInBits = Size * ScalarSizeInBits;
12150
12151 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12152 for (int i = 0; i < Size; i += Scale)
12153 for (int j = 0; j < Shift; ++j)
12154 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12155 return false;
12156
12157 return true;
12158 };
12159
12160 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12161 for (int i = 0; i != Size; i += Scale) {
12162 unsigned Pos = Left ? i + Shift : i;
12163 unsigned Low = Left ? i : i + Shift;
12164 unsigned Len = Scale - Shift;
12165 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12166 return -1;
12167 }
12168
12169 int ShiftEltBits = ScalarSizeInBits * Scale;
12170 bool ByteShift = ShiftEltBits > 64;
12171 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12172 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12173 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12174
12175 // Normalize the scale for byte shifts to still produce an i64 element
12176 // type.
12177 Scale = ByteShift ? Scale / 2 : Scale;
12178
12179 // We need to round trip through the appropriate type for the shift.
12180 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12181 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12182 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12183 return (int)ShiftAmt;
12184 };
12185
12186 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12187 // keep doubling the size of the integer elements up to that. We can
12188 // then shift the elements of the integer vector by whole multiples of
12189 // their width within the elements of the larger integer vector. Test each
12190 // multiple to see if we can find a match with the moved element indices
12191 // and that the shifted in elements are all zeroable.
12192 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12193 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12194 for (int Shift = 1; Shift != Scale; ++Shift)
12195 for (bool Left : {true, false})
12196 if (CheckZeros(Shift, Scale, Left)) {
12197 int ShiftAmt = MatchShift(Shift, Scale, Left);
12198 if (0 < ShiftAmt)
12199 return ShiftAmt;
12200 }
12201
12202 // no match
12203 return -1;
12204}
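// Illustrative standalone sketch (not part of the LLVM source): the v4i32
// PSRL example above, <1, zz, 3, zz>, is matched with Scale = 2 and
// Shift = 1, i.e. a 32-bit logical right shift of each 64-bit half
// (X86ISD::VSRLI on v2i64 with a shift amount of 32):
#include <cstdint>
static uint64_t vsrliSketch(uint64_t QuadWord) {
  // Drops the low 32-bit element of each 64-bit group and shifts in zeros.
  return QuadWord >> 32;
}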
12205
12206 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12207 SDValue V2, ArrayRef<int> Mask,
12208 const APInt &Zeroable,
12209 const X86Subtarget &Subtarget,
12210 SelectionDAG &DAG, bool BitwiseOnly) {
12211 int Size = Mask.size();
12212 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12213
12214 MVT ShiftVT;
12215 SDValue V = V1;
12216 unsigned Opcode;
12217
12218 // Try to match shuffle against V1 shift.
12219 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12220 Mask, 0, Zeroable, Subtarget);
12221
12222 // If V1 failed, try to match shuffle against V2 shift.
12223 if (ShiftAmt < 0) {
12224 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12225 Mask, Size, Zeroable, Subtarget);
12226 V = V2;
12227 }
12228
12229 if (ShiftAmt < 0)
12230 return SDValue();
12231
12232 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12233 return SDValue();
12234
12235 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12236 "Illegal integer vector type");
12237 V = DAG.getBitcast(ShiftVT, V);
12238 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12239 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12240 return DAG.getBitcast(VT, V);
12241}
12242
12243// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12244// Remainder of lower half result is zero and upper half is all undef.
12245static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12246 ArrayRef<int> Mask, uint64_t &BitLen,
12247 uint64_t &BitIdx, const APInt &Zeroable) {
12248 int Size = Mask.size();
12249 int HalfSize = Size / 2;
12250 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12251 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12252
12253 // Upper half must be undefined.
12254 if (!isUndefUpperHalf(Mask))
12255 return false;
12256
12257 // Determine the extraction length from the part of the
12258 // lower half that isn't zeroable.
12259 int Len = HalfSize;
12260 for (; Len > 0; --Len)
12261 if (!Zeroable[Len - 1])
12262 break;
12263 assert(Len > 0 && "Zeroable shuffle mask");
12264
12265 // Attempt to match first Len sequential elements from the lower half.
12266 SDValue Src;
12267 int Idx = -1;
12268 for (int i = 0; i != Len; ++i) {
12269 int M = Mask[i];
12270 if (M == SM_SentinelUndef)
12271 continue;
12272 SDValue &V = (M < Size ? V1 : V2);
12273 M = M % Size;
12274
12275 // The extracted elements must start at a valid index and all mask
12276 // elements must be in the lower half.
12277 if (i > M || M >= HalfSize)
12278 return false;
12279
12280 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12281 Src = V;
12282 Idx = M - i;
12283 continue;
12284 }
12285 return false;
12286 }
12287
12288 if (!Src || Idx < 0)
12289 return false;
12290
12291 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12292 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12293 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12294 V1 = Src;
12295 return true;
12296}
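// Illustrative standalone sketch (not part of the LLVM source): for the
// v8i16 mask <2, 3, zz, zz, u, u, u, u> the matcher above produces
// BitLen = 32 and BitIdx = 32, i.e. EXTRQ copies bits [32,63] of the low
// half into the low bits of the result and zeroes the rest of the low half
// (the upper half is undefined). On the low 64 bits that is:
#include <cstdint>
static uint64_t extrqSketch(uint64_t LoHalf, unsigned BitLen, unsigned BitIdx) {
  uint64_t FieldMask = BitLen >= 64 ? ~0ULL : ((1ULL << BitLen) - 1);
  return (LoHalf >> BitIdx) & FieldMask;
}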
12297
12298// INSERTQ: Extract lowest Len elements from lower half of second source and
12299// insert over first source, starting at Idx.
12300// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12301static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12302 ArrayRef<int> Mask, uint64_t &BitLen,
12303 uint64_t &BitIdx) {
12304 int Size = Mask.size();
12305 int HalfSize = Size / 2;
12306 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12307
12308 // Upper half must be undefined.
12309 if (!isUndefUpperHalf(Mask))
12310 return false;
12311
12312 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12313 SDValue Base;
12314
12315 // Attempt to match first source from mask before insertion point.
12316 if (isUndefInRange(Mask, 0, Idx)) {
12317 /* EMPTY */
12318 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12319 Base = V1;
12320 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12321 Base = V2;
12322 } else {
12323 continue;
12324 }
12325
12326 // Extend the extraction length looking to match both the insertion of
12327 // the second source and the remaining elements of the first.
12328 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12329 SDValue Insert;
12330 int Len = Hi - Idx;
12331
12332 // Match insertion.
12333 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12334 Insert = V1;
12335 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12336 Insert = V2;
12337 } else {
12338 continue;
12339 }
12340
12341 // Match the remaining elements of the lower half.
12342 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12343 /* EMPTY */
12344 } else if ((!Base || (Base == V1)) &&
12345 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12346 Base = V1;
12347 } else if ((!Base || (Base == V2)) &&
12348 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12349 Size + Hi)) {
12350 Base = V2;
12351 } else {
12352 continue;
12353 }
12354
12355 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12356 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12357 V1 = Base;
12358 V2 = Insert;
12359 return true;
12360 }
12361 }
12362
12363 return false;
12364}
12365
12366/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12367 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12368 SDValue V2, ArrayRef<int> Mask,
12369 const APInt &Zeroable, SelectionDAG &DAG) {
12370 uint64_t BitLen, BitIdx;
12371 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12372 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12373 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12374 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12375
12376 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12377 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12378 V2 ? V2 : DAG.getUNDEF(VT),
12379 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12380 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12381
12382 return SDValue();
12383}
12384
12385/// Lower a vector shuffle as an any/signed/zero extension.
12386///
12387/// Given a specific number of elements, element bit width, and extension
12388 /// stride, produce an extension based on the available
12389 /// features of the subtarget. The extended elements are consecutive and
12390 /// can start from an offset element index in the input; to
12391 /// avoid excess shuffling the offset must either be in the bottom lane
12392/// or at the start of a higher lane. All extended elements must be from
12393/// the same lane.
12394 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12395 int Scale, int Offset,
12396 unsigned ExtOpc, SDValue InputV,
12397 ArrayRef<int> Mask,
12398 const X86Subtarget &Subtarget,
12399 SelectionDAG &DAG) {
12400 assert(Scale > 1 && "Need a scale to extend.");
12401 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12402 int EltBits = VT.getScalarSizeInBits();
12403 int NumElements = VT.getVectorNumElements();
12404 int NumEltsPerLane = 128 / EltBits;
12405 int OffsetLane = Offset / NumEltsPerLane;
12406 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12407 "Only 8, 16, and 32 bit elements can be extended.");
12408 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12409 assert(0 <= Offset && "Extension offset must be positive.");
12410 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12411 "Extension offset must be in the first lane or start an upper lane.");
12412
12413 // Check that an index is in same lane as the base offset.
12414 auto SafeOffset = [&](int Idx) {
12415 return OffsetLane == (Idx / NumEltsPerLane);
12416 };
12417
12418 // Shift along an input so that the offset base moves to the first element.
12419 auto ShuffleOffset = [&](SDValue V) {
12420 if (!Offset)
12421 return V;
12422
12423 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12424 for (int i = 0; i * Scale < NumElements; ++i) {
12425 int SrcIdx = i + Offset;
12426 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12427 }
12428 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12429 };
12430
12431 // Found a valid a/zext mask! Try various lowering strategies based on the
12432 // input type and available ISA extensions.
12433 if (Subtarget.hasSSE41()) {
12434 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12435 // PUNPCK will catch this in a later shuffle match.
12436 if (Offset && Scale == 2 && VT.is128BitVector())
12437 return SDValue();
12438 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12439 NumElements / Scale);
12440 InputV = DAG.getBitcast(VT, InputV);
12441 InputV = ShuffleOffset(InputV);
12442 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12443 return DAG.getBitcast(VT, InputV);
12444 }
12445
12446 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12447 InputV = DAG.getBitcast(VT, InputV);
12448 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12449
12450 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12451 if (ExtOpc == ISD::SIGN_EXTEND)
12452 return SDValue();
12453
12454 // For any extends we can cheat for larger element sizes and use shuffle
12455 // instructions that can fold with a load and/or copy.
12456 if (AnyExt && EltBits == 32) {
12457 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12458 -1};
12459 return DAG.getBitcast(
12460 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12461 DAG.getBitcast(MVT::v4i32, InputV),
12462 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12463 }
12464 if (AnyExt && EltBits == 16 && Scale > 2) {
12465 int PSHUFDMask[4] = {Offset / 2, -1,
12466 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12467 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12468 DAG.getBitcast(MVT::v4i32, InputV),
12469 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12470 int PSHUFWMask[4] = {1, -1, -1, -1};
12471 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12472 return DAG.getBitcast(
12473 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12474 DAG.getBitcast(MVT::v8i16, InputV),
12475 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12476 }
12477
12478 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12479 // to 64-bits.
12480 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12481 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12482 assert(VT.is128BitVector() && "Unexpected vector width!");
12483
12484 int LoIdx = Offset * EltBits;
12485 SDValue Lo = DAG.getBitcast(
12486 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12487 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12488 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12489
12490 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12491 return DAG.getBitcast(VT, Lo);
12492
12493 int HiIdx = (Offset + 1) * EltBits;
12494 SDValue Hi = DAG.getBitcast(
12495 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12496 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12497 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12498 return DAG.getBitcast(VT,
12499 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12500 }
12501
12502 // If this would require more than 2 unpack instructions to expand, use
12503 // pshufb when available. We can only use more than 2 unpack instructions
12504 // when zero extending i8 elements which also makes it easier to use pshufb.
12505 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12506 assert(NumElements == 16 && "Unexpected byte vector width!");
12507 SDValue PSHUFBMask[16];
12508 for (int i = 0; i < 16; ++i) {
12509 int Idx = Offset + (i / Scale);
12510 if ((i % Scale == 0 && SafeOffset(Idx))) {
12511 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12512 continue;
12513 }
12514 PSHUFBMask[i] =
12515 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12516 }
12517 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12518 return DAG.getBitcast(
12519 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12520 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12521 }
12522
12523 // If we are extending from an offset, ensure we start on a boundary that
12524 // we can unpack from.
12525 int AlignToUnpack = Offset % (NumElements / Scale);
12526 if (AlignToUnpack) {
12527 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12528 for (int i = AlignToUnpack; i < NumElements; ++i)
12529 ShMask[i - AlignToUnpack] = i;
12530 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12531 Offset -= AlignToUnpack;
12532 }
12533
12534 // Otherwise emit a sequence of unpacks.
12535 do {
12536 unsigned UnpackLoHi = X86ISD::UNPCKL;
12537 if (Offset >= (NumElements / 2)) {
12538 UnpackLoHi = X86ISD::UNPCKH;
12539 Offset -= (NumElements / 2);
12540 }
12541
12542 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12543 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12544 : getZeroVector(InputVT, Subtarget, DAG, DL);
12545 InputV = DAG.getBitcast(InputVT, InputV);
12546 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12547 Scale /= 2;
12548 EltBits *= 2;
12549 NumElements /= 2;
12550 } while (Scale > 1);
12551 return DAG.getBitcast(VT, InputV);
12552}
12553
12554/// Try to lower a vector shuffle as a zero extension on any microarch.
12555///
12556/// This routine will try to do everything in its power to cleverly lower
12557/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12558/// check for the profitability of this lowering, it tries to aggressively
12559/// match this pattern. It will use all of the micro-architectural details it
12560/// can to emit an efficient lowering. It handles both blends with all-zero
12561/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12562/// masking out later).
12563///
12564/// The reason we have dedicated lowering for zext-style shuffles is that they
12565/// are both incredibly common and often quite performance sensitive.
12566 static SDValue lowerShuffleAsZeroOrAnyExtend(
12567 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12568 const APInt &Zeroable, const X86Subtarget &Subtarget,
12569 SelectionDAG &DAG) {
12570 int Bits = VT.getSizeInBits();
12571 int NumLanes = Bits / 128;
12572 int NumElements = VT.getVectorNumElements();
12573 int NumEltsPerLane = NumElements / NumLanes;
12574 assert(VT.getScalarSizeInBits() <= 32 &&
12575 "Exceeds 32-bit integer zero extension limit");
12576 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12577
12578 // Define a helper function to check a particular ext-scale and lower to it if
12579 // valid.
12580 auto Lower = [&](int Scale) -> SDValue {
12581 SDValue InputV;
12582 bool AnyExt = true;
12583 int Offset = 0;
12584 int Matches = 0;
12585 for (int i = 0; i < NumElements; ++i) {
12586 int M = Mask[i];
12587 if (M < 0)
12588 continue; // Valid anywhere but doesn't tell us anything.
12589 if (i % Scale != 0) {
12590 // Each of the extended elements needs to be zeroable.
12591 if (!Zeroable[i])
12592 return SDValue();
12593
12594 // We no longer are in the anyext case.
12595 AnyExt = false;
12596 continue;
12597 }
12598
12599 // The base elements need to be consecutive indices into the
12600 // same input vector.
12601 SDValue V = M < NumElements ? V1 : V2;
12602 M = M % NumElements;
12603 if (!InputV) {
12604 InputV = V;
12605 Offset = M - (i / Scale);
12606 } else if (InputV != V)
12607 return SDValue(); // Flip-flopping inputs.
12608
12609 // Offset must start in the lowest 128-bit lane or at the start of an
12610 // upper lane.
12611 // FIXME: Is it ever worth allowing a negative base offset?
12612 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12613 (Offset % NumEltsPerLane) == 0))
12614 return SDValue();
12615
12616 // If we are offsetting, all referenced entries must come from the same
12617 // lane.
12618 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12619 return SDValue();
12620
12621 if ((M % NumElements) != (Offset + (i / Scale)))
12622 return SDValue(); // Non-consecutive strided elements.
12623 Matches++;
12624 }
12625
12626 // If we fail to find an input, we have a zero-shuffle which should always
12627 // have already been handled.
12628 // FIXME: Maybe handle this here in case during blending we end up with one?
12629 if (!InputV)
12630 return SDValue();
12631
12632 // If we are offsetting, don't extend if we only match a single input, we
12633 // can always do better by using a basic PSHUF or PUNPCK.
12634 if (Offset != 0 && Matches < 2)
12635 return SDValue();
12636
12637 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12638 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12639 InputV, Mask, Subtarget, DAG);
12640 };
12641
12642 // The widest scale possible for extending is to a 64-bit integer.
12643 assert(Bits % 64 == 0 &&
12644 "The number of bits in a vector must be divisible by 64 on x86!");
12645 int NumExtElements = Bits / 64;
12646
12647 // Each iteration, try extending the elements half as much, but into twice as
12648 // many elements.
12649 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12650 assert(NumElements % NumExtElements == 0 &&
12651 "The input vector size must be divisible by the extended size.");
12652 if (SDValue V = Lower(NumElements / NumExtElements))
12653 return V;
12654 }
12655
12656 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12657 if (Bits != 128)
12658 return SDValue();
12659
12660 // Returns one of the source operands if the shuffle can be reduced to a
12661 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12662 auto CanZExtLowHalf = [&]() {
12663 for (int i = NumElements / 2; i != NumElements; ++i)
12664 if (!Zeroable[i])
12665 return SDValue();
12666 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12667 return V1;
12668 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12669 return V2;
12670 return SDValue();
12671 };
12672
12673 if (SDValue V = CanZExtLowHalf()) {
12674 V = DAG.getBitcast(MVT::v2i64, V);
12675 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12676 return DAG.getBitcast(VT, V);
12677 }
12678
12679 // No viable ext lowering found.
12680 return SDValue();
12681}
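// Illustrative standalone sketch (not part of the LLVM source): the v4i32
// mask <0, zz, 1, zz> is recognized here with Scale = 2 and lowered as a
// zero extension of the two low i32 elements to i64 (e.g. PMOVZXDQ on
// SSE4.1 targets). Element-wise that is simply:
#include <cstdint>
static void zextSketch(const uint32_t In[2], uint64_t Out[2]) {
  Out[0] = In[0]; // Output element 0: <0, zz> - upper 32 bits are zero.
  Out[1] = In[1]; // Output element 1: <1, zz> - upper 32 bits are zero.
}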
12682
12683/// Try to get a scalar value for a specific element of a vector.
12684///
12685/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12686 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12687 SelectionDAG &DAG) {
12688 MVT VT = V.getSimpleValueType();
12689 MVT EltVT = VT.getVectorElementType();
12690 V = peekThroughBitcasts(V);
12691
12692 // If the bitcasts shift the element size, we can't extract an equivalent
12693 // element from it.
12694 MVT NewVT = V.getSimpleValueType();
12695 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12696 return SDValue();
12697
12698 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12699 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12700 // Ensure the scalar operand is the same size as the destination.
12701 // FIXME: Add support for scalar truncation where possible.
12702 SDValue S = V.getOperand(Idx);
12703 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12704 return DAG.getBitcast(EltVT, S);
12705 }
12706
12707 return SDValue();
12708}
12709
12710/// Helper to test for a load that can be folded with x86 shuffles.
12711///
12712/// This is particularly important because the set of instructions varies
12713/// significantly based on whether the operand is a load or not.
12714 static bool isShuffleFoldableLoad(SDValue V) {
12715 return V.hasOneUse() &&
12716 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12717 }
12718
12719template<typename T>
12720static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12721 T EltVT = VT.getScalarType();
12722 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12723 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12724}
12725
12726/// Try to lower insertion of a single element into a zero vector.
12727///
12728/// This is a common pattern that we have especially efficient patterns to lower
12729/// across all subtarget feature sets.
12730 static SDValue lowerShuffleAsElementInsertion(
12731 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12732 const APInt &Zeroable, const X86Subtarget &Subtarget,
12733 SelectionDAG &DAG) {
12734 MVT ExtVT = VT;
12735 MVT EltVT = VT.getVectorElementType();
12736 unsigned NumElts = VT.getVectorNumElements();
12737 unsigned EltBits = VT.getScalarSizeInBits();
12738
12739 if (isSoftF16(EltVT, Subtarget))
12740 return SDValue();
12741
12742 int V2Index =
12743 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12744 Mask.begin();
12745 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12746 bool IsV1Zeroable = true;
12747 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12748 if (i != V2Index && !Zeroable[i]) {
12749 IsV1Zeroable = false;
12750 break;
12751 }
12752
12753 // Bail if a non-zero V1 isn't used in place.
12754 if (!IsV1Zeroable) {
12755 SmallVector<int, 8> V1Mask(Mask);
12756 V1Mask[V2Index] = -1;
12757 if (!isNoopShuffleMask(V1Mask))
12758 return SDValue();
12759 }
12760
12761 // Check for a single input from a SCALAR_TO_VECTOR node.
12762 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12763 // all the smarts here sunk into that routine. However, the current
12764 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12765 // vector shuffle lowering is dead.
12766 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12767 DAG);
12768 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12769 // We need to zext the scalar if it is smaller than an i32.
12770 V2S = DAG.getBitcast(EltVT, V2S);
12771 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12772 // Using zext to expand a narrow element won't work for non-zero
12773 // insertions. But we can use a masked constant vector if we're
12774 // inserting V2 into the bottom of V1.
12775 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12776 return SDValue();
12777
12778 // Zero-extend directly to i32.
12779 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12780 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12781
12782 // If we're inserting into a constant, mask off the inserted index
12783 // and OR with the zero-extended scalar.
12784 if (!IsV1Zeroable) {
12785 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12786 Bits[V2Index] = APInt::getZero(EltBits);
12787 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12788 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12789 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12790 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12791 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12792 }
12793 }
12794 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12795 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12796 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12797 // Either not inserting from the low element of the input or the input
12798 // element size is too small to use VZEXT_MOVL to clear the high bits.
12799 return SDValue();
12800 }
12801
12802 if (!IsV1Zeroable) {
12803 // If V1 can't be treated as a zero vector we have fewer options to lower
12804 // this. We can't support integer vectors or non-zero targets cheaply.
12805 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12806 if (!VT.isFloatingPoint() || V2Index != 0)
12807 return SDValue();
12808 if (!VT.is128BitVector())
12809 return SDValue();
12810
12811 // Otherwise, use MOVSD, MOVSS or MOVSH.
12812 unsigned MovOpc = 0;
12813 if (EltVT == MVT::f16)
12814 MovOpc = X86ISD::MOVSH;
12815 else if (EltVT == MVT::f32)
12816 MovOpc = X86ISD::MOVSS;
12817 else if (EltVT == MVT::f64)
12818 MovOpc = X86ISD::MOVSD;
12819 else
12820 llvm_unreachable("Unsupported floating point element type to handle!");
12821 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12822 }
12823
12824 // This lowering only works for the low element with floating point vectors.
12825 if (VT.isFloatingPoint() && V2Index != 0)
12826 return SDValue();
12827
12828 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12829 if (ExtVT != VT)
12830 V2 = DAG.getBitcast(VT, V2);
12831
12832 if (V2Index != 0) {
12833 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12834 // the desired position. Otherwise it is more efficient to do a vector
12835 // shift left. We know that we can do a vector shift left because all
12836 // the inputs are zero.
12837 if (VT.isFloatingPoint() || NumElts <= 4) {
12838 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12839 V2Shuffle[V2Index] = 0;
12840 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12841 } else {
12842 V2 = DAG.getBitcast(MVT::v16i8, V2);
12843 V2 = DAG.getNode(
12844 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12845 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
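// Hypothetical example: for v8i16 (EltBits == 16) with V2Index == 5, the
// vector is shifted left by 5 * 16 / 8 == 10 bytes, moving the element from
// lane 0 into lane 5.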
12846 V2 = DAG.getBitcast(VT, V2);
12847 }
12848 }
12849 return V2;
12850}
12851
12852 /// Try to lower a broadcast of a single (truncated) integer element,
12853/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12854///
12855/// This assumes we have AVX2.
12856static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12857 int BroadcastIdx,
12858 const X86Subtarget &Subtarget,
12859 SelectionDAG &DAG) {
12860 assert(Subtarget.hasAVX2() &&
12861 "We can only lower integer broadcasts with AVX2!");
12862
12863 MVT EltVT = VT.getVectorElementType();
12864 MVT V0VT = V0.getSimpleValueType();
12865
12866 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12867 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12868
12869 MVT V0EltVT = V0VT.getVectorElementType();
12870 if (!V0EltVT.isInteger())
12871 return SDValue();
12872
12873 const unsigned EltSize = EltVT.getSizeInBits();
12874 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12875
12876 // This is only a truncation if the original element type is larger.
12877 if (V0EltSize <= EltSize)
12878 return SDValue();
12879
12880 assert(((V0EltSize % EltSize) == 0) &&
12881 "Scalar type sizes must all be powers of 2 on x86!");
12882
12883 const unsigned V0Opc = V0.getOpcode();
12884 const unsigned Scale = V0EltSize / EltSize;
12885 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12886
12887 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12888 V0Opc != ISD::BUILD_VECTOR)
12889 return SDValue();
12890
12891 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12892
12893 // If we're extracting non-least-significant bits, shift so we can truncate.
12894 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12895 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12896 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12897 if (const int OffsetIdx = BroadcastIdx % Scale)
12898 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12899 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
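// Illustrative trace (hypothetical values): broadcasting i8 element 5 of a
// v4i32 build_vector gives Scale == 4, V0BroadcastIdx == 1 and OffsetIdx == 1,
// so the selected i32 scalar is shifted right by 8 bits before the truncate.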
12900
12901 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12902 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12903}
12904
12905/// Test whether this can be lowered with a single SHUFPS instruction.
12906///
12907/// This is used to disable more specialized lowerings when the shufps lowering
12908/// will happen to be efficient.
12909static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12910 // This routine only handles 128-bit shufps.
12911 assert(Mask.size() == 4 && "Unsupported mask size!");
12912 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12913 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12914 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12915 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12916
12917 // To lower with a single SHUFPS we need to have the low half and high half
12918 // each requiring a single input.
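// For example (illustrative masks), {0, 1, 6, 7} satisfies this: the low half
// uses only the first input and the high half only the second. {0, 4, 1, 5}
// does not, because its low half mixes both inputs.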
12919 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12920 return false;
12921 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12922 return false;
12923
12924 return true;
12925}
12926
12927/// Test whether the specified input (0 or 1) is in-place blended by the
12928/// given mask.
12929///
12930/// This returns true if the elements from a particular input are already in the
12931 /// slots required by the given mask and require no permutation.
12932static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12933 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12934 int Size = Mask.size();
12935 for (int i = 0; i < Size; ++i)
12936 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12937 return false;
12938
12939 return true;
12940}
12941
12942/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12943/// the given mask.
12944///
12945static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12946 int BroadcastableElement = 0) {
12947 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12948 int Size = Mask.size();
12949 for (int i = 0; i < Size; ++i)
12950 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12951 Mask[i] % Size != BroadcastableElement)
12952 return false;
12953 return true;
12954}
12955
12956/// If we are extracting two 128-bit halves of a vector and shuffling the
12957/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12958/// multi-shuffle lowering.
12959static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12960 SDValue N1, ArrayRef<int> Mask,
12961 SelectionDAG &DAG) {
12962 MVT VT = N0.getSimpleValueType();
12963 assert((VT.is128BitVector() &&
12964 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12965 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12966
12967 // Check that both sources are extracts of the same source vector.
12968 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12969 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12970 N0.getOperand(0) != N1.getOperand(0) ||
12971 !N0.hasOneUse() || !N1.hasOneUse())
12972 return SDValue();
12973
12974 SDValue WideVec = N0.getOperand(0);
12975 MVT WideVT = WideVec.getSimpleValueType();
12976 if (!WideVT.is256BitVector())
12977 return SDValue();
12978
12979 // Match extracts of each half of the wide source vector. Commute the shuffle
12980 // if the extract of the low half is N1.
12981 unsigned NumElts = VT.getVectorNumElements();
12982 SmallVector<int, 4> NewMask(Mask);
12983 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12984 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12985 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12986 ShuffleVectorSDNode::commuteMask(NewMask);
12987 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12988 return SDValue();
12989
12990 // Final bailout: if the mask is simple, we are better off using an extract
12991 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12992 // because that avoids a constant load from memory.
12993 if (NumElts == 4 &&
12994 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12995 return SDValue();
12996
12997 // Extend the shuffle mask with undef elements.
12998 NewMask.append(NumElts, -1);
12999
13000 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13001 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13002 NewMask);
13003 // This is free: ymm -> xmm.
13004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13005 DAG.getVectorIdxConstant(0, DL));
13006}
13007
13008/// Try to lower broadcast of a single element.
13009///
13010/// For convenience, this code also bundles all of the subtarget feature set
13011/// filtering. While a little annoying to re-dispatch on type here, there isn't
13012/// a convenient way to factor it out.
13013static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13014 SDValue V2, ArrayRef<int> Mask,
13015 const X86Subtarget &Subtarget,
13016 SelectionDAG &DAG) {
13017 MVT EltVT = VT.getVectorElementType();
13018 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13019 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13020 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13021 return SDValue();
13022
13023 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13024 // we can only broadcast from a register with AVX2.
13025 unsigned NumEltBits = VT.getScalarSizeInBits();
13026 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13027 ? X86ISD::MOVDDUP
13028 : X86ISD::VBROADCAST;
13029 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13030
13031 // Check that the mask is a broadcast.
13032 int BroadcastIdx = getSplatIndex(Mask);
13033 if (BroadcastIdx < 0) {
13034 // Check for hidden broadcast.
13035 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13036 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13037 return SDValue();
13038 BroadcastIdx = 0;
13039 }
13040 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13041 "a sorted mask where the broadcast "
13042 "comes from V1.");
13043 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13044
13045 // Go up the chain of (vector) values to find a scalar load that we can
13046 // combine with the broadcast.
13047 // TODO: Combine this logic with findEltLoadSrc() used by
13048 // EltsFromConsecutiveLoads().
13049 int BitOffset = BroadcastIdx * NumEltBits;
13050 SDValue V = V1;
13051 for (;;) {
13052 switch (V.getOpcode()) {
13053 case ISD::BITCAST: {
13054 V = V.getOperand(0);
13055 continue;
13056 }
13057 case ISD::CONCAT_VECTORS: {
13058 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13059 int OpIdx = BitOffset / OpBitWidth;
13060 V = V.getOperand(OpIdx);
13061 BitOffset %= OpBitWidth;
13062 continue;
13063 }
13064 case ISD::EXTRACT_SUBVECTOR: {
13065 // The extraction index adds to the existing offset.
13066 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13067 unsigned Idx = V.getConstantOperandVal(1);
13068 unsigned BeginOffset = Idx * EltBitWidth;
13069 BitOffset += BeginOffset;
13070 V = V.getOperand(0);
13071 continue;
13072 }
13073 case ISD::INSERT_SUBVECTOR: {
13074 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13075 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13076 int Idx = (int)V.getConstantOperandVal(2);
13077 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13078 int BeginOffset = Idx * EltBitWidth;
13079 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13080 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13081 BitOffset -= BeginOffset;
13082 V = VInner;
13083 } else {
13084 V = VOuter;
13085 }
13086 continue;
13087 }
13088 }
13089 break;
13090 }
13091 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13092 BroadcastIdx = BitOffset / NumEltBits;
13093
13094 // Do we need to bitcast the source to retrieve the original broadcast index?
13095 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13096
13097 // Check if this is a broadcast of a scalar. We special case lowering
13098 // for scalars so that we can more effectively fold with loads.
13099 // If the original value has a larger element type than the shuffle, the
13100 // broadcast element is in essence truncated. Make that explicit to ease
13101 // folding.
13102 if (BitCastSrc && VT.isInteger())
13103 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13104 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13105 return TruncBroadcast;
13106
13107 // Also check the simpler case, where we can directly reuse the scalar.
13108 if (!BitCastSrc &&
13109 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13110 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13111 V = V.getOperand(BroadcastIdx);
13112
13113 // If we can't broadcast from a register, check that the input is a load.
13114 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13115 return SDValue();
13116 } else if (ISD::isNormalLoad(V.getNode()) &&
13117 cast<LoadSDNode>(V)->isSimple()) {
13118 // We do not check for one-use of the vector load because a broadcast load
13119 // is expected to be a win for code size, register pressure, and possibly
13120 // uops even if the original vector load is not eliminated.
13121
13122 // Reduce the vector load and shuffle to a broadcasted scalar load.
13123 auto *Ld = cast<LoadSDNode>(V);
13124 SDValue BaseAddr = Ld->getBasePtr();
13125 MVT SVT = VT.getScalarType();
13126 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13127 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13128 SDValue NewAddr =
13129 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13130
13131 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13132 // than MOVDDUP.
13133 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13134 if (Opcode == X86ISD::VBROADCAST) {
13135 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13136 SDValue Ops[] = {Ld->getChain(), NewAddr};
13137 V = DAG.getMemIntrinsicNode(
13138 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13139 DAG.getMachineFunction().getMachineMemOperand(
13140 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13141 DAG.makeEquivalentMemoryOrdering(Ld, V);
13142 return DAG.getBitcast(VT, V);
13143 }
13144 assert(SVT == MVT::f64 && "Unexpected VT!");
13145 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13146 DAG.getMachineFunction().getMachineMemOperand(
13147 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13148 DAG.makeEquivalentMemoryOrdering(Ld, V);
13149 } else if (!BroadcastFromReg) {
13150 // We can't broadcast from a vector register.
13151 return SDValue();
13152 } else if (BitOffset != 0) {
13153 // We can only broadcast from the zero-element of a vector register,
13154 // but it can be advantageous to broadcast from the zero-element of a
13155 // subvector.
13156 if (!VT.is256BitVector() && !VT.is512BitVector())
13157 return SDValue();
13158
13159 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13160 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13161 return SDValue();
13162
13163 // If we are broadcasting an element from the lowest 128-bit subvector, try
13164 // to move the element in position.
13165 if (BitOffset < 128 && NumActiveElts > 1 &&
13166 V.getScalarValueSizeInBits() == NumEltBits) {
13167 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13168 "Unexpected bit-offset");
13169 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13170 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13171 V = extractSubVector(V, 0, DAG, DL, 128);
13172 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13173 } else {
13174 // Only broadcast the zero-element of a 128-bit subvector.
13175 if ((BitOffset % 128) != 0)
13176 return SDValue();
13177
13178 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13179 "Unexpected bit-offset");
13180 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13181 "Unexpected vector size");
13182 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13183 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13184 }
13185 }
13186
13187 // On AVX we can use VBROADCAST directly for scalar sources.
13188 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13189 V = DAG.getBitcast(MVT::f64, V);
13190 if (Subtarget.hasAVX()) {
13191 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13192 return DAG.getBitcast(VT, V);
13193 }
13194 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13195 }
13196
13197 // If this is a scalar, do the broadcast on this type and bitcast.
13198 if (!V.getValueType().isVector()) {
13199 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13200 "Unexpected scalar size");
13201 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13202 VT.getSizeInBits() / NumEltBits);
13203 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13204 }
13205
13206 // We only support broadcasting from 128-bit vectors to minimize the
13207 // number of patterns we need to deal with in isel. So extract down to
13208 // 128-bits, removing as many bitcasts as possible.
13209 if (V.getValueSizeInBits() > 128)
13210 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13211
13212 // Otherwise cast V to a vector with the same element type as VT, but
13213 // possibly narrower than VT. Then perform the broadcast.
13214 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13215 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13216 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13217}
13218
13219// Check for whether we can use INSERTPS to perform the shuffle. We only use
13220// INSERTPS when the V1 elements are already in the correct locations
13221// because otherwise we can just always use two SHUFPS instructions which
13222// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13223// perform INSERTPS if a single V1 element is out of place and all V2
13224// elements are zeroable.
13225static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13226 unsigned &InsertPSMask,
13227 const APInt &Zeroable,
13228 ArrayRef<int> Mask, SelectionDAG &DAG) {
13229 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13230 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13231 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13232
13233 // Attempt to match INSERTPS with one element from VA or VB being
13234 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13235 // are updated.
13236 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13237 ArrayRef<int> CandidateMask) {
13238 unsigned ZMask = 0;
13239 int VADstIndex = -1;
13240 int VBDstIndex = -1;
13241 bool VAUsedInPlace = false;
13242
13243 for (int i = 0; i < 4; ++i) {
13244 // Synthesize a zero mask from the zeroable elements (includes undefs).
13245 if (Zeroable[i]) {
13246 ZMask |= 1 << i;
13247 continue;
13248 }
13249
13250 // Flag if we use any VA inputs in place.
13251 if (i == CandidateMask[i]) {
13252 VAUsedInPlace = true;
13253 continue;
13254 }
13255
13256 // We can only insert a single non-zeroable element.
13257 if (VADstIndex >= 0 || VBDstIndex >= 0)
13258 return false;
13259
13260 if (CandidateMask[i] < 4) {
13261 // VA input out of place for insertion.
13262 VADstIndex = i;
13263 } else {
13264 // VB input for insertion.
13265 VBDstIndex = i;
13266 }
13267 }
13268
13269 // Don't bother if we have no (non-zeroable) element for insertion.
13270 if (VADstIndex < 0 && VBDstIndex < 0)
13271 return false;
13272
13273 // Determine element insertion src/dst indices. The src index is from the
13274 // start of the inserted vector, not the start of the concatenated vector.
13275 unsigned VBSrcIndex = 0;
13276 if (VADstIndex >= 0) {
13277 // If we have a VA input out of place, we use VA as the V2 element
13278 // insertion and don't use the original V2 at all.
13279 VBSrcIndex = CandidateMask[VADstIndex];
13280 VBDstIndex = VADstIndex;
13281 VB = VA;
13282 } else {
13283 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13284 }
13285
13286 // If no V1 inputs are used in place, then the result is created only from
13287 // the zero mask and the V2 insertion - so remove V1 dependency.
13288 if (!VAUsedInPlace)
13289 VA = DAG.getUNDEF(MVT::v4f32);
13290
13291 // Update V1, V2 and InsertPSMask accordingly.
13292 V1 = VA;
13293 V2 = VB;
13294
13295 // Insert the V2 element into the desired position.
13296 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13297 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
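// For reference, the INSERTPS immediate packs the source element index into
// bits [7:6], the destination lane into bits [5:4] and the zero mask into
// bits [3:0]; e.g. (hypothetical values) VBSrcIndex == 2, VBDstIndex == 1 and
// ZMask == 0b1000 encode as 0x98.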
13298 return true;
13299 };
13300
13301 if (matchAsInsertPS(V1, V2, Mask))
13302 return true;
13303
13304 // Commute and try again.
13305 SmallVector<int, 4> CommutedMask(Mask);
13306 ShuffleVectorSDNode::commuteMask(CommutedMask);
13307 if (matchAsInsertPS(V2, V1, CommutedMask))
13308 return true;
13309
13310 return false;
13311}
13312
13313static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13314 ArrayRef<int> Mask, const APInt &Zeroable,
13315 SelectionDAG &DAG) {
13316 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13317 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13318
13319 // Attempt to match the insertps pattern.
13320 unsigned InsertPSMask = 0;
13321 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13322 return SDValue();
13323
13324 // Insert the V2 element into the desired position.
13325 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13326 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13327}
13328
13329/// Handle lowering of 2-lane 64-bit floating point shuffles.
13330///
13331/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13332/// support for floating point shuffles but not integer shuffles. These
13333/// instructions will incur a domain crossing penalty on some chips though so
13334/// it is better to avoid lowering through this for integer vectors where
13335/// possible.
13336static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13337 const APInt &Zeroable, SDValue V1, SDValue V2,
13338 const X86Subtarget &Subtarget,
13339 SelectionDAG &DAG) {
13340 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13341 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13342 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13343
13344 if (V2.isUndef()) {
13345 // Check for being able to broadcast a single element.
13346 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13347 Mask, Subtarget, DAG))
13348 return Broadcast;
13349
13350 // Straight shuffle of a single input vector. Simulate this by using the
13351 // single input as both of the "inputs" to this instruction.
13352 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
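// For example (illustrative mask), a splat of the high element (Mask == {1, 1})
// yields the immediate 0b11, selecting element 1 from each (identical) input.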
13353
13354 if (Subtarget.hasAVX()) {
13355 // If we have AVX, we can use VPERMILPS which will allow folding a load
13356 // into the shuffle.
13357 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13358 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13359 }
13360
13361 return DAG.getNode(
13362 X86ISD::SHUFP, DL, MVT::v2f64,
13363 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13364 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13365 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13366 }
13367 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13368 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13369 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13370 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13371
13372 if (Subtarget.hasAVX2())
13373 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13374 return Extract;
13375
13376 // When loading a scalar and then shuffling it into a vector we can often do
13377 // the insertion cheaply.
13378 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13379 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13380 return Insertion;
13381 // Try inverting the insertion since for v2 masks it is easy to do and we
13382 // can't reliably sort the mask one way or the other.
13383 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13384 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13385 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13386 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13387 return Insertion;
13388
13389 // Try to use one of the special instruction patterns to handle two common
13390 // blend patterns if a zero-blend above didn't work.
13391 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13392 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13393 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13394 // We can either use a special instruction to load over the low double or
13395 // to move just the low double.
13396 return DAG.getNode(
13397 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13398 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13399
13400 if (Subtarget.hasSSE41())
13401 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13402 Zeroable, Subtarget, DAG))
13403 return Blend;
13404
13405 // Use dedicated unpack instructions for masks that match their pattern.
13406 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13407 return V;
13408
13409 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
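// For example (illustrative mask), Mask == {1, 3} selects the high double of
// V1 and the high double of V2, giving the immediate 0b11.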
13410 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13411 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13412}
13413
13414/// Handle lowering of 2-lane 64-bit integer shuffles.
13415///
13416/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13417/// the integer unit to minimize domain crossing penalties. However, for blends
13418/// it falls back to the floating point shuffle operation with appropriate bit
13419/// casting.
13420static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13421 const APInt &Zeroable, SDValue V1, SDValue V2,
13422 const X86Subtarget &Subtarget,
13423 SelectionDAG &DAG) {
13424 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13425 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13426 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13427
13428 if (V2.isUndef()) {
13429 // Check for being able to broadcast a single element.
13430 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13431 Mask, Subtarget, DAG))
13432 return Broadcast;
13433
13434 // Straight shuffle of a single input vector. For everything from SSE2
13435 // onward this has a single fast instruction with no scary immediates.
13436 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13437 V1 = DAG.getBitcast(MVT::v4i32, V1);
13438 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13439 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13440 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13441 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
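// As an illustration, a hypothetical v2i64 mask {1, 0} widens to the v4i32
// mask {2, 3, 0, 1}, i.e. a PSHUFD immediate of 0x4E.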
13442 return DAG.getBitcast(
13443 MVT::v2i64,
13444 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13445 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13446 }
13447 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13448 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13449 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13450 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13451
13452 if (Subtarget.hasAVX2())
13453 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13454 return Extract;
13455
13456 // Try to use shift instructions.
13457 if (SDValue Shift =
13458 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13459 DAG, /*BitwiseOnly*/ false))
13460 return Shift;
13461
13462 // When loading a scalar and then shuffling it into a vector we can often do
13463 // the insertion cheaply.
13464 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13465 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13466 return Insertion;
13467 // Try inverting the insertion since for v2 masks it is easy to do and we
13468 // can't reliably sort the mask one way or the other.
13469 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13470 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13471 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13472 return Insertion;
13473
13474 // We have different paths for blend lowering, but they all must use the
13475 // *exact* same predicate.
13476 bool IsBlendSupported = Subtarget.hasSSE41();
13477 if (IsBlendSupported)
13478 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13479 Zeroable, Subtarget, DAG))
13480 return Blend;
13481
13482 // Use dedicated unpack instructions for masks that match their pattern.
13483 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13484 return V;
13485
13486 // Try to use byte rotation instructions.
13487 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13488 if (Subtarget.hasSSSE3()) {
13489 if (Subtarget.hasVLX())
13490 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13491 Zeroable, Subtarget, DAG))
13492 return Rotate;
13493
13494 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13495 Subtarget, DAG))
13496 return Rotate;
13497 }
13498
13499 // If we have direct support for blends, we should lower by decomposing into
13500 // a permute. That will be faster than the domain cross.
13501 if (IsBlendSupported)
13502 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13503 Zeroable, Subtarget, DAG);
13504
13505 // We implement this with SHUFPD which is pretty lame because it will likely
13506 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13507 // However, all the alternatives are still more cycles and newer chips don't
13508 // have this problem. It would be really nice if x86 had better shuffles here.
13509 V1 = DAG.getBitcast(MVT::v2f64, V1);
13510 V2 = DAG.getBitcast(MVT::v2f64, V2);
13511 return DAG.getBitcast(MVT::v2i64,
13512 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13513}
13514
13515/// Lower a vector shuffle using the SHUFPS instruction.
13516///
13517/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13518/// It makes no assumptions about whether this is the *best* lowering, it simply
13519/// uses it.
13520static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13521 ArrayRef<int> Mask, SDValue V1,
13522 SDValue V2, SelectionDAG &DAG) {
13523 SDValue LowV = V1, HighV = V2;
13524 SmallVector<int, 4> NewMask(Mask);
13525 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13526
13527 if (NumV2Elements == 1) {
13528 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13529
13530 // Compute the index adjacent to V2Index and in the same half by toggling
13531 // the low bit.
13532 int V2AdjIndex = V2Index ^ 1;
13533
13534 if (Mask[V2AdjIndex] < 0) {
13535 // Handles all the cases where we have a single V2 element and an undef.
13536 // This will only ever happen in the high lanes because we commute the
13537 // vector otherwise.
13538 if (V2Index < 2)
13539 std::swap(LowV, HighV);
13540 NewMask[V2Index] -= 4;
13541 } else {
13542 // Handle the case where the V2 element ends up adjacent to a V1 element.
13543 // To make this work, blend them together as the first step.
13544 int V1Index = V2AdjIndex;
13545 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13546 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13547 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13548
13549 // Now proceed to reconstruct the final blend as we have the necessary
13550 // high or low half formed.
13551 if (V2Index < 2) {
13552 LowV = V2;
13553 HighV = V1;
13554 } else {
13555 HighV = V2;
13556 }
13557 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13558 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13559 }
13560 } else if (NumV2Elements == 2) {
13561 if (Mask[0] < 4 && Mask[1] < 4) {
13562 // Handle the easy case where we have V1 in the low lanes and V2 in the
13563 // high lanes.
13564 NewMask[2] -= 4;
13565 NewMask[3] -= 4;
13566 } else if (Mask[2] < 4 && Mask[3] < 4) {
13567 // We also handle the reversed case because this utility may get called
13568 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13569 // arrange things in the right direction.
13570 NewMask[0] -= 4;
13571 NewMask[1] -= 4;
13572 HighV = V1;
13573 LowV = V2;
13574 } else {
13575 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13576 // trying to place elements directly, just blend them and set up the final
13577 // shuffle to place them.
13578
13579 // The first two blend mask elements are for V1, the second two are for
13580 // V2.
13581 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13582 Mask[2] < 4 ? Mask[2] : Mask[3],
13583 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13584 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13585 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13586 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13587
13588 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13589 // a blend.
13590 LowV = HighV = V1;
13591 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13592 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13593 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13594 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13595 }
13596 } else if (NumV2Elements == 3) {
13597 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13598 // we can get here via other paths (e.g. repeated mask matching) for which we
13599 // don't want to do another round of lowerVECTOR_SHUFFLE.
13600 ShuffleVectorSDNode::commuteMask(NewMask);
13601 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13602 }
13603 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13604 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13605}
13606
13607/// Lower 4-lane 32-bit floating point shuffles.
13608///
13609/// Uses instructions exclusively from the floating point unit to minimize
13610/// domain crossing penalties, as these are sufficient to implement all v4f32
13611/// shuffles.
13612static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13613 const APInt &Zeroable, SDValue V1, SDValue V2,
13614 const X86Subtarget &Subtarget,
13615 SelectionDAG &DAG) {
13616 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13617 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13618 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13619
13620 if (Subtarget.hasSSE41())
13621 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13622 Zeroable, Subtarget, DAG))
13623 return Blend;
13624
13625 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13626
13627 if (NumV2Elements == 0) {
13628 // Check for being able to broadcast a single element.
13629 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13630 Mask, Subtarget, DAG))
13631 return Broadcast;
13632
13633 // Use even/odd duplicate instructions for masks that match their pattern.
13634 if (Subtarget.hasSSE3()) {
13635 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13636 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13637 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13638 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13639 }
13640
13641 if (Subtarget.hasAVX()) {
13642 // If we have AVX, we can use VPERMILPS which will allow folding a load
13643 // into the shuffle.
13644 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13645 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13646 }
13647
13648 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13649 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13650 if (!Subtarget.hasSSE2()) {
13651 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13652 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13653 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13654 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13655 }
13656
13657 // Otherwise, use a straight shuffle of a single input vector. We pass the
13658 // input vector to both operands to simulate this with a SHUFPS.
13659 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13660 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13661 }
13662
13663 if (Subtarget.hasSSE2())
13664 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13665 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13666 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13667 return ZExt;
13668 }
13669
13670 if (Subtarget.hasAVX2())
13671 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13672 return Extract;
13673
13674 // There are special ways we can lower some single-element blends. However, we
13675 // have custom ways we can lower more complex single-element blends below that
13676 // we defer to if both this and BLENDPS fail to match, so restrict this to
13677 // when the V2 input is targeting element 0 of the mask -- that is the fast
13678 // case here.
13679 if (NumV2Elements == 1 && Mask[0] >= 4)
13680 if (SDValue V = lowerShuffleAsElementInsertion(
13681 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13682 return V;
13683
13684 if (Subtarget.hasSSE41()) {
13685 // Use INSERTPS if we can complete the shuffle efficiently.
13686 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13687 return V;
13688
13689 if (!isSingleSHUFPSMask(Mask))
13690 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13691 V2, Mask, DAG))
13692 return BlendPerm;
13693 }
13694
13695 // Use low/high mov instructions. These are only valid in SSE1 because
13696 // otherwise they are widened to v2f64 and never get here.
13697 if (!Subtarget.hasSSE2()) {
13698 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13699 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13700 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13701 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13702 }
13703
13704 // Use dedicated unpack instructions for masks that match their pattern.
13705 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13706 return V;
13707
13708 // Otherwise fall back to a SHUFPS lowering strategy.
13709 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13710}
13711
13712/// Lower 4-lane i32 vector shuffles.
13713///
13714/// We try to handle these with integer-domain shuffles where we can, but for
13715/// blends we use the floating point domain blend instructions.
13716static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13717 const APInt &Zeroable, SDValue V1, SDValue V2,
13718 const X86Subtarget &Subtarget,
13719 SelectionDAG &DAG) {
13720 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13721 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13722 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13723
13724 // Whenever we can lower this as a zext, that instruction is strictly faster
13725 // than any alternative. It also allows us to fold memory operands into the
13726 // shuffle in many cases.
13727 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13728 Zeroable, Subtarget, DAG))
13729 return ZExt;
13730
13731 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13732
13733 // Try to use shift instructions if fast.
13734 if (Subtarget.preferLowerShuffleAsShift()) {
13735 if (SDValue Shift =
13736 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13737 Subtarget, DAG, /*BitwiseOnly*/ true))
13738 return Shift;
13739 if (NumV2Elements == 0)
13740 if (SDValue Rotate =
13741 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13742 return Rotate;
13743 }
13744
13745 if (NumV2Elements == 0) {
13746 // Try to use broadcast unless the mask only has one non-undef element.
13747 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13748 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13749 Mask, Subtarget, DAG))
13750 return Broadcast;
13751 }
13752
13753 // Straight shuffle of a single input vector. For everything from SSE2
13754 // onward this has a single fast instruction with no scary immediates.
13755 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13756 // but we aren't actually going to use the UNPCK instruction because doing
13757 // so prevents folding a load into this instruction or making a copy.
13758 const int UnpackLoMask[] = {0, 0, 1, 1};
13759 const int UnpackHiMask[] = {2, 2, 3, 3};
13760 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13761 Mask = UnpackLoMask;
13762 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13763 Mask = UnpackHiMask;
13764
13765 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13766 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13767 }
13768
13769 if (Subtarget.hasAVX2())
13770 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13771 return Extract;
13772
13773 // Try to use shift instructions.
13774 if (SDValue Shift =
13775 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13776 DAG, /*BitwiseOnly*/ false))
13777 return Shift;
13778
13779 // There are special ways we can lower some single-element blends.
13780 if (NumV2Elements == 1)
13781 if (SDValue V = lowerShuffleAsElementInsertion(
13782 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13783 return V;
13784
13785 // We have different paths for blend lowering, but they all must use the
13786 // *exact* same predicate.
13787 bool IsBlendSupported = Subtarget.hasSSE41();
13788 if (IsBlendSupported)
13789 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13790 Zeroable, Subtarget, DAG))
13791 return Blend;
13792
13793 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13794 Zeroable, Subtarget, DAG))
13795 return Masked;
13796
13797 // Use dedicated unpack instructions for masks that match their pattern.
13798 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13799 return V;
13800
13801 // Try to use byte rotation instructions.
13802 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13803 if (Subtarget.hasSSSE3()) {
13804 if (Subtarget.hasVLX())
13805 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13806 Zeroable, Subtarget, DAG))
13807 return Rotate;
13808
13809 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13810 Subtarget, DAG))
13811 return Rotate;
13812 }
13813
13814 // Assume that a single SHUFPS is faster than an alternative sequence of
13815 // multiple instructions (even if the CPU has a domain penalty).
13816 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13817 if (!isSingleSHUFPSMask(Mask)) {
13818 // If we have direct support for blends, we should lower by decomposing into
13819 // a permute. That will be faster than the domain cross.
13820 if (IsBlendSupported)
13821 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13822 Zeroable, Subtarget, DAG);
13823
13824 // Try to lower by permuting the inputs into an unpack instruction.
13825 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13826 Mask, Subtarget, DAG))
13827 return Unpack;
13828 }
13829
13830 // We implement this with SHUFPS because it can blend from two vectors.
13831 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13832 // up the inputs, bypassing domain shift penalties that we would incur if we
13833 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13834 // relevant.
13835 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13836 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13837 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13838 return DAG.getBitcast(MVT::v4i32, ShufPS);
13839}
13840
13841/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13842/// shuffle lowering, and the most complex part.
13843///
13844/// The lowering strategy is to try to form pairs of input lanes which are
13845/// targeted at the same half of the final vector, and then use a dword shuffle
13846/// to place them onto the right half, and finally unpack the paired lanes into
13847/// their final position.
13848///
13849/// The exact breakdown of how to form these dword pairs and align them on the
13850/// correct sides is really tricky. See the comments within the function for
13851/// more of the details.
13852///
13853/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13854/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13855/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13856/// vector, form the analogous 128-bit 8-element Mask.
13857static SDValue lowerV8I16GeneralSingleInputShuffle(
13858 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13859 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13860 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13861 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13862
13863 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13864 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13865 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13866
13867 // Attempt to directly match PSHUFLW or PSHUFHW.
13868 if (isUndefOrInRange(LoMask, 0, 4) &&
13869 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13870 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13871 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13872 }
13873 if (isUndefOrInRange(HiMask, 4, 8) &&
13874 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13875 for (int i = 0; i != 4; ++i)
13876 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
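// For instance, a hypothetical HiMask of {4, 4, 6, 7} is rebased to
// {0, 0, 2, 3} before being encoded into the PSHUFHW immediate.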
13877 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13878 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13879 }
13880
13881 SmallVector<int, 4> LoInputs;
13882 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13883 array_pod_sort(LoInputs.begin(), LoInputs.end());
13884 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13885 SmallVector<int, 4> HiInputs;
13886 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13887 array_pod_sort(HiInputs.begin(), HiInputs.end());
13888 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13889 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13890 int NumHToL = LoInputs.size() - NumLToL;
13891 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13892 int NumHToH = HiInputs.size() - NumLToH;
13893 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13894 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13895 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13896 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13897
13898 // If we are shuffling values from one half - check how many different DWORD
13899 // pairs we need to create. If only 1 or 2 then we can perform this as a
13900 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13901 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13902 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13903 V = DAG.getNode(ShufWOp, DL, VT, V,
13904 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13905 V = DAG.getBitcast(PSHUFDVT, V);
13906 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13907 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13908 return DAG.getBitcast(VT, V);
13909 };
13910
13911 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13912 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13913 SmallVector<std::pair<int, int>, 4> DWordPairs;
13914 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13915
13916 // Collect the different DWORD pairs.
13917 for (int DWord = 0; DWord != 4; ++DWord) {
13918 int M0 = Mask[2 * DWord + 0];
13919 int M1 = Mask[2 * DWord + 1];
13920 M0 = (M0 >= 0 ? M0 % 4 : M0);
13921 M1 = (M1 >= 0 ? M1 % 4 : M1);
13922 if (M0 < 0 && M1 < 0)
13923 continue;
13924
13925 bool Match = false;
13926 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13927 auto &DWordPair = DWordPairs[j];
13928 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13929 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13930 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13931 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13932 PSHUFDMask[DWord] = DOffset + j;
13933 Match = true;
13934 break;
13935 }
13936 }
13937 if (!Match) {
13938 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13939 DWordPairs.push_back(std::make_pair(M0, M1));
13940 }
13941 }
13942
13943 if (DWordPairs.size() <= 2) {
13944 DWordPairs.resize(2, std::make_pair(-1, -1));
13945 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13946 DWordPairs[1].first, DWordPairs[1].second};
13947 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13948 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13949 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13950 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13951 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13952 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13953 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13954 }
13955 if ((NumHToL + NumHToH) == 0)
13956 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13957 if ((NumLToL + NumLToH) == 0)
13958 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13959 }
13960 }
13961
13962 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13963 // such inputs we can swap two of the dwords across the half mark and end up
13964 // with <=2 inputs to each half in each half. Once there, we can fall through
13965 // to the generic code below. For example:
13966 //
13967 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13968 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13969 //
13970 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13971 // and an existing 2-into-2 on the other half. In this case we may have to
13972 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13973 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13974 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13975 // because any other situation (including a 3-into-1 or 1-into-3 in the half
13976 // other than the one we target for fixing) will be fixed when we re-enter this
13977 // path. We will also combine any resulting sequence of PSHUFD instructions
13978 // into a single instruction. Here is an example of the tricky case:
13979 //
13980 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13981 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13982 //
13983 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13984 //
13985 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13986 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13987 //
13988 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13989 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13990 //
13991 // The result is fine to be handled by the generic logic.
13992 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13993 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13994 int AOffset, int BOffset) {
13995 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13996 "Must call this with A having 3 or 1 inputs from the A half.");
13997 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13998 "Must call this with B having 1 or 3 inputs from the B half.");
13999 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14000 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14001
14002 bool ThreeAInputs = AToAInputs.size() == 3;
14003
14004 // Compute the index of the dword with only one word among the three inputs in
14005 // a half by taking the sum of the half with three inputs and subtracting
14006 // the sum of the actual three inputs. The difference is the remaining
14007 // slot.
14008 int ADWord = 0, BDWord = 0;
14009 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14010 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14011 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14012 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14013 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14014 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14015 int TripleNonInputIdx =
14016 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14017 TripleDWord = TripleNonInputIdx / 2;
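// Worked example (hypothetical inputs): if the triple half's inputs are
// {0, 1, 3} with TripleInputOffset == 0, then TripleInputSum == 6,
// TripleNonInputIdx == 2 and TripleDWord == 1.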
14018
14019 // We use xor with one to compute the adjacent DWord to whichever one the
14020 // OneInput is in.
14021 OneInputDWord = (OneInput / 2) ^ 1;
14022
14023 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14024 // and BToA inputs. If there is also such a problem with the BToB and AToB
14025 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14026 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14027 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14028 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14029 // Compute how many inputs will be flipped by swapping these DWords. We
14030 // need
14031 // to balance this to ensure we don't form a 3-1 shuffle in the other
14032 // half.
14033 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14034 llvm::count(AToBInputs, 2 * ADWord + 1);
14035 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14036 llvm::count(BToBInputs, 2 * BDWord + 1);
14037 if ((NumFlippedAToBInputs == 1 &&
14038 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14039 (NumFlippedBToBInputs == 1 &&
14040 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14041 // We choose whether to fix the A half or B half based on whether that
14042 // half has zero flipped inputs. At zero, we may not be able to fix it
14043 // with that half. We also bias towards fixing the B half because that
14044 // will more commonly be the high half, and we have to bias one way.
14045 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14046 ArrayRef<int> Inputs) {
14047 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14048 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14049 // Determine whether the free index is in the flipped dword or the
14050 // unflipped dword based on where the pinned index is. We use this bit
14051 // in an xor to conditionally select the adjacent dword.
14052 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14053 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14054 if (IsFixIdxInput == IsFixFreeIdxInput)
14055 FixFreeIdx += 1;
14056 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14057 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14058 "We need to be changing the number of flipped inputs!");
14059 int PSHUFHalfMask[] = {0, 1, 2, 3};
14060 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14061 V = DAG.getNode(
14062 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14063 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14064 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14065
14066 for (int &M : Mask)
14067 if (M >= 0 && M == FixIdx)
14068 M = FixFreeIdx;
14069 else if (M >= 0 && M == FixFreeIdx)
14070 M = FixIdx;
14071 };
14072 if (NumFlippedBToBInputs != 0) {
14073 int BPinnedIdx =
14074 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14075 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14076 } else {
14077 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14078 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14079 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14080 }
14081 }
14082 }
14083
14084 int PSHUFDMask[] = {0, 1, 2, 3};
14085 PSHUFDMask[ADWord] = BDWord;
14086 PSHUFDMask[BDWord] = ADWord;
14087 V = DAG.getBitcast(
14088 VT,
14089 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14090 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14091
14092 // Adjust the mask to match the new locations of A and B.
14093 for (int &M : Mask)
14094 if (M >= 0 && M/2 == ADWord)
14095 M = 2 * BDWord + M % 2;
14096 else if (M >= 0 && M/2 == BDWord)
14097 M = 2 * ADWord + M % 2;
14098
14099 // Recurse back into this routine to re-compute state now that this isn't
14100 // a 3 and 1 problem.
14101 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14102 };
14103 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14104 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14105 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14106 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14107
14108 // At this point there are at most two inputs to the low and high halves from
14109 // each half. That means the inputs can always be grouped into dwords and
14110 // those dwords can then be moved to the correct half with a dword shuffle.
14111 // We use at most one low and one high word shuffle to collect these paired
14112 // inputs into dwords, and finally a dword shuffle to place them.
14113 int PSHUFLMask[4] = {-1, -1, -1, -1};
14114 int PSHUFHMask[4] = {-1, -1, -1, -1};
14115 int PSHUFDMask[4] = {-1, -1, -1, -1};
14116
14117 // First fix the masks for all the inputs that are staying in their
14118 // original halves. This will then dictate the targets of the cross-half
14119 // shuffles.
14120 auto fixInPlaceInputs =
14121 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14122 MutableArrayRef<int> SourceHalfMask,
14123 MutableArrayRef<int> HalfMask, int HalfOffset) {
14124 if (InPlaceInputs.empty())
14125 return;
14126 if (InPlaceInputs.size() == 1) {
14127 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14128 InPlaceInputs[0] - HalfOffset;
14129 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14130 return;
14131 }
14132 if (IncomingInputs.empty()) {
14133 // Just fix all of the in place inputs.
14134 for (int Input : InPlaceInputs) {
14135 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14136 PSHUFDMask[Input / 2] = Input / 2;
14137 }
14138 return;
14139 }
14140
14141 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14142 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14143 InPlaceInputs[0] - HalfOffset;
14144 // Put the second input next to the first so that they are packed into
14145 // a dword. We find the adjacent index by toggling the low bit.
14146 int AdjIndex = InPlaceInputs[0] ^ 1;
14147 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14148 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14149 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14150 };
14151 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14152 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14153
14154 // Now gather the cross-half inputs and place them into a free dword of
14155 // their target half.
14156 // FIXME: This operation could almost certainly be simplified dramatically to
14157 // look more like the 3-1 fixing operation.
14158 auto moveInputsToRightHalf = [&PSHUFDMask](
14159 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14160 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14161 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14162 int DestOffset) {
14163 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14164 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14165 };
14166 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14167 int Word) {
14168 int LowWord = Word & ~1;
14169 int HighWord = Word | 1;
14170 return isWordClobbered(SourceHalfMask, LowWord) ||
14171 isWordClobbered(SourceHalfMask, HighWord);
14172 };
14173
14174 if (IncomingInputs.empty())
14175 return;
14176
14177 if (ExistingInputs.empty()) {
14178 // Map any dwords with inputs from them into the right half.
14179 for (int Input : IncomingInputs) {
14180 // If the source half mask maps over the inputs, turn those into
14181 // swaps and use the swapped lane.
14182 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14183 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14184 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14185 Input - SourceOffset;
14186 // We have to swap the uses in our half mask in one sweep.
14187 for (int &M : HalfMask)
14188 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14189 M = Input;
14190 else if (M == Input)
14191 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14192 } else {
14193 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14194 Input - SourceOffset &&
14195 "Previous placement doesn't match!");
14196 }
14197 // Note that this correctly re-maps both when we do a swap and when
14198 // we observe the other side of the swap above. We rely on that to
14199 // avoid swapping the members of the input list directly.
14200 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14201 }
14202
14203 // Map the input's dword into the correct half.
14204 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14205 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14206 else
14207 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14208 Input / 2 &&
14209 "Previous placement doesn't match!");
14210 }
14211
14212 // And just directly shift any other-half mask elements to be same-half
14213 // as we will have mirrored the dword containing the element into the
14214 // same position within that half.
14215 for (int &M : HalfMask)
14216 if (M >= SourceOffset && M < SourceOffset + 4) {
14217 M = M - SourceOffset + DestOffset;
14218 assert(M >= 0 && "This should never wrap below zero!");
14219 }
14220 return;
14221 }
14222
14223 // Ensure we have the input in a viable dword of its current half. This
14224 // is particularly tricky because the original position may be clobbered
14225 // by inputs being moved and *staying* in that half.
14226 if (IncomingInputs.size() == 1) {
14227 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14228 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14229 SourceOffset;
14230 SourceHalfMask[InputFixed - SourceOffset] =
14231 IncomingInputs[0] - SourceOffset;
14232 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14233 IncomingInputs[0] = InputFixed;
14234 }
14235 } else if (IncomingInputs.size() == 2) {
14236 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14237 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14238 // We have two non-adjacent or clobbered inputs we need to extract from
14239 // the source half. To do this, we need to map them into some adjacent
14240 // dword slot in the source mask.
14241 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14242 IncomingInputs[1] - SourceOffset};
14243
14244 // If there is a free slot in the source half mask adjacent to one of
14245 // the inputs, place the other input in it. We use (Index XOR 1) to
14246 // compute an adjacent index.
14247 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14248 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14249 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14250 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14251 InputsFixed[1] = InputsFixed[0] ^ 1;
14252 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14253 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14254 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14255 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14256 InputsFixed[0] = InputsFixed[1] ^ 1;
14257 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14258 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14259 // The two inputs are in the same DWord but it is clobbered and the
14260 // adjacent DWord isn't used at all. Move both inputs to the free
14261 // slot.
14262 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14263 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14264 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14265 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14266 } else {
14267 // The only way we hit this point is if there is no clobbering
14268 // (because there are no off-half inputs to this half) and there is no
14269 // free slot adjacent to one of the inputs. In this case, we have to
14270 // swap an input with a non-input.
14271 for (int i = 0; i < 4; ++i)
14272 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14273 "We can't handle any clobbers here!");
14274 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14275 "Cannot have adjacent inputs here!");
14276
14277 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14278 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14279
14280 // We also have to update the final source mask in this case because
14281 // it may need to undo the above swap.
14282 for (int &M : FinalSourceHalfMask)
14283 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14284 M = InputsFixed[1] + SourceOffset;
14285 else if (M == InputsFixed[1] + SourceOffset)
14286 M = (InputsFixed[0] ^ 1) + SourceOffset;
14287
14288 InputsFixed[1] = InputsFixed[0] ^ 1;
14289 }
14290
14291 // Point everything at the fixed inputs.
14292 for (int &M : HalfMask)
14293 if (M == IncomingInputs[0])
14294 M = InputsFixed[0] + SourceOffset;
14295 else if (M == IncomingInputs[1])
14296 M = InputsFixed[1] + SourceOffset;
14297
14298 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14299 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14300 }
14301 } else {
14302 llvm_unreachable("Unhandled input size!");
14303 }
14304
14305 // Now hoist the DWord down to the right half.
14306 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14307 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14308 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14309 for (int &M : HalfMask)
14310 for (int Input : IncomingInputs)
14311 if (M == Input)
14312 M = FreeDWord * 2 + Input % 2;
14313 };
14314 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14315 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14316 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14317 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14318
14319 // Now enact all the shuffles we've computed to move the inputs into their
14320 // target half.
14321 if (!isNoopShuffleMask(PSHUFLMask))
14322 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14323 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14324 if (!isNoopShuffleMask(PSHUFHMask))
14325 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14326 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14327 if (!isNoopShuffleMask(PSHUFDMask))
14328 V = DAG.getBitcast(
14329 VT,
14330 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14331 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14332
14333 // At this point, each half should contain all its inputs, and we can then
14334 // just shuffle them into their final position.
14335 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14336 "Failed to lift all the high half inputs to the low mask!");
14337 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14338 "Failed to lift all the low half inputs to the high mask!");
14339
14340 // Do a half shuffle for the low mask.
14341 if (!isNoopShuffleMask(LoMask))
14342 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14343 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14344
14345 // Do a half shuffle with the high mask after shifting its values down.
14346 for (int &M : HiMask)
14347 if (M >= 0)
14348 M -= 4;
14349 if (!isNoopShuffleMask(HiMask))
14350 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14351 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14352
14353 return V;
14354}
14355
14356/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14357/// blend if only one input is used.
14358static SDValue lowerShuffleAsBlendOfPSHUFBs(
14359 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14360 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14361 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14362 "Lane crossing shuffle masks not supported");
14363
14364 int NumBytes = VT.getSizeInBits() / 8;
14365 int Size = Mask.size();
14366 int Scale = NumBytes / Size;
14367
14368 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14369 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14370 V1InUse = false;
14371 V2InUse = false;
14372
14373 for (int i = 0; i < NumBytes; ++i) {
14374 int M = Mask[i / Scale];
14375 if (M < 0)
14376 continue;
14377
14378 const int ZeroMask = 0x80;
14379 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14380 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14381 if (Zeroable[i / Scale])
14382 V1Idx = V2Idx = ZeroMask;
14383
14384 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14385 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14386 V1InUse |= (ZeroMask != V1Idx);
14387 V2InUse |= (ZeroMask != V2Idx);
14388 }
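  // Worked example (added for illustration): for the v8i16 interleave mask
  // <0,8,1,9,2,10,3,11> (Size == 8, Scale == 2) the loop above produces
  //   V1Mask = {0,1, 0x80,0x80, 2,3, 0x80,0x80, 4,5, 0x80,0x80, 6,7, 0x80,0x80}
  //   V2Mask = {0x80,0x80, 0,1, 0x80,0x80, 2,3, 0x80,0x80, 4,5, 0x80,0x80, 6,7}
  // PSHUFB zeroes any destination byte whose control byte has bit 7 set, so
  // OR-ing the two shuffled inputs below reassembles the blended result.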
14389
14390 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14391 if (V1InUse)
14392 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14393 DAG.getBuildVector(ShufVT, DL, V1Mask));
14394 if (V2InUse)
14395 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14396 DAG.getBuildVector(ShufVT, DL, V2Mask));
14397
14398 // If we need shuffled inputs from both, blend the two.
14399 SDValue V;
14400 if (V1InUse && V2InUse)
14401 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14402 else
14403 V = V1InUse ? V1 : V2;
14404
14405 // Cast the result back to the correct type.
14406 return DAG.getBitcast(VT, V);
14407}
14408
14409/// Generic lowering of 8-lane i16 shuffles.
14410///
14411/// This handles both single-input shuffles and combined shuffle/blends with
14412/// two inputs. The single input shuffles are immediately delegated to
14413/// a dedicated lowering routine.
14414///
14415/// The blends are lowered in one of three fundamental ways. If there are few
14416/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14417/// of the input is significantly cheaper when lowered as an interleaving of
14418/// the two inputs, try to interleave them. Otherwise, blend the low and high
14419/// halves of the inputs separately (making them have relatively few inputs)
14420/// and then concatenate them.
14421static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14422 const APInt &Zeroable, SDValue V1, SDValue V2,
14423 const X86Subtarget &Subtarget,
14424 SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14426 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14427 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14428
14429 // Whenever we can lower this as a zext, that instruction is strictly faster
14430 // than any alternative.
14431 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14432 Zeroable, Subtarget, DAG))
14433 return ZExt;
14434
14435 // Try to lower using a truncation.
14436 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14437 Subtarget, DAG))
14438 return V;
14439
14440 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14441
14442 if (NumV2Inputs == 0) {
14443 // Try to use shift instructions.
14444 if (SDValue Shift =
14445 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14446 Subtarget, DAG, /*BitwiseOnly*/ false))
14447 return Shift;
14448
14449 // Check for being able to broadcast a single element.
14450 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14451 Mask, Subtarget, DAG))
14452 return Broadcast;
14453
14454 // Try to use bit rotation instructions.
14455 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14456 Subtarget, DAG))
14457 return Rotate;
14458
14459 // Use dedicated unpack instructions for masks that match their pattern.
14460 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14461 return V;
14462
14463 // Use dedicated pack instructions for masks that match their pattern.
14464 if (SDValue V =
14465 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14466 return V;
14467
14468 // Try to use byte rotation instructions.
14469 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14470 Subtarget, DAG))
14471 return Rotate;
14472
14473 // Make a copy of the mask so it can be modified.
14474 SmallVector<int, 8> MutableMask(Mask);
14475 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14476 Subtarget, DAG);
14477 }
14478
14479 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14480 "All single-input shuffles should be canonicalized to be V1-input "
14481 "shuffles.");
14482
14483 // Try to use shift instructions.
14484 if (SDValue Shift =
14485 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14486 DAG, /*BitwiseOnly*/ false))
14487 return Shift;
14488
14489 // See if we can use SSE4A Extraction / Insertion.
14490 if (Subtarget.hasSSE4A())
14491 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14492 Zeroable, DAG))
14493 return V;
14494
14495 // There are special ways we can lower some single-element blends.
14496 if (NumV2Inputs == 1)
14497 if (SDValue V = lowerShuffleAsElementInsertion(
14498 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14499 return V;
14500
14501 // We have different paths for blend lowering, but they all must use the
14502 // *exact* same predicate.
14503 bool IsBlendSupported = Subtarget.hasSSE41();
14504 if (IsBlendSupported)
14505 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14506 Zeroable, Subtarget, DAG))
14507 return Blend;
14508
14509 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14510 Zeroable, Subtarget, DAG))
14511 return Masked;
14512
14513 // Use dedicated unpack instructions for masks that match their pattern.
14514 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14515 return V;
14516
14517 // Use dedicated pack instructions for masks that match their pattern.
14518 if (SDValue V =
14519 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14520 return V;
14521
14522 // Try to lower using a truncation.
14523 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14524 Subtarget, DAG))
14525 return V;
14526
14527 // Try to use byte rotation instructions.
14528 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14529 Subtarget, DAG))
14530 return Rotate;
14531
14532 if (SDValue BitBlend =
14533 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14534 return BitBlend;
14535
14536 // Try to use byte shift instructions to mask.
14537 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14538 Zeroable, Subtarget, DAG))
14539 return V;
14540
14541 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14542 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14543 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14544 !Subtarget.hasVLX()) {
14545 // Check if this is part of a 256-bit vector truncation.
14546 unsigned PackOpc = 0;
14547 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14548 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14549 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14550 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14551 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14552 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14553 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14554 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14555 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14556 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14557 PackOpc = X86ISD::PACKUS;
14558 } else if (Subtarget.hasSSE41()) {
14559 SmallVector<SDValue, 4> DWordClearOps(4,
14560 DAG.getConstant(0, DL, MVT::i32));
14561 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14562 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14563 SDValue DWordClearMask =
14564 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14565 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14566 DWordClearMask);
14567 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14568 DWordClearMask);
14569 PackOpc = X86ISD::PACKUS;
14570 } else if (!Subtarget.hasSSSE3()) {
14571 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14572 V1 = DAG.getBitcast(MVT::v4i32, V1);
14573 V2 = DAG.getBitcast(MVT::v4i32, V2);
14574 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14575 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14576 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14577 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14578 PackOpc = X86ISD::PACKSS;
14579 }
14580 if (PackOpc) {
14581 // Now pack things back together.
14582 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14583 if (NumEvenDrops == 2) {
14584 Result = DAG.getBitcast(MVT::v4i32, Result);
14585 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14586 }
14587 return Result;
14588 }
14589 }
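  // Worked example (added for illustration): for the truncating mask
  // <0,2,4,6,8,10,12,14> we have NumEvenDrops == 1. On SSE41 each dword is
  // ANDed with 0x0000FFFF so only the even-numbered words survive, and
  // PACKUSDW then narrows every dword back to a word, yielding the even
  // elements of V1 followed by the even elements of V2.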
14590
14591 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14592 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14593 if (NumOddDrops == 1) {
14594 bool HasSSE41 = Subtarget.hasSSE41();
14595 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14596 DAG.getBitcast(MVT::v4i32, V1),
14597 DAG.getTargetConstant(16, DL, MVT::i8));
14598 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14599 DAG.getBitcast(MVT::v4i32, V2),
14600 DAG.getTargetConstant(16, DL, MVT::i8));
14601 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14602 MVT::v8i16, V1, V2);
14603 }
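  // Note (added for clarity): the 16-bit right shift moves each odd word into
  // the low half of its dword. With SSE41 a logical shift plus PACKUSDW packs
  // the zero-extended values exactly; pre-SSE41 an arithmetic shift keeps the
  // values in signed range so PACKSSDW truncates them without saturation.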
14604
14605 // Try to lower by permuting the inputs into an unpack instruction.
14606 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14607 Mask, Subtarget, DAG))
14608 return Unpack;
14609
14610 // If we can't directly blend but can use PSHUFB, that will be better as it
14611 // can both shuffle and set up the inefficient blend.
14612 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14613 bool V1InUse, V2InUse;
14614 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14615 Zeroable, DAG, V1InUse, V2InUse);
14616 }
14617
14618 // We can always bit-blend if we have to, so the fallback strategy is to
14619 // decompose into single-input permutes and blends/unpacks.
14620 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14621 Zeroable, Subtarget, DAG);
14622}
14623
14624/// Lower 8-lane 16-bit floating point shuffles.
14625static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14626 const APInt &Zeroable, SDValue V1, SDValue V2,
14627 const X86Subtarget &Subtarget,
14628 SelectionDAG &DAG) {
14629 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14630 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14631 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14632 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14633
14634 if (Subtarget.hasFP16()) {
14635 if (NumV2Elements == 0) {
14636 // Check for being able to broadcast a single element.
14637 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14638 Mask, Subtarget, DAG))
14639 return Broadcast;
14640 }
14641 if (NumV2Elements == 1 && Mask[0] >= 8)
14642 if (SDValue V = lowerShuffleAsElementInsertion(
14643 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14644 return V;
14645 }
14646
14647 V1 = DAG.getBitcast(MVT::v8i16, V1);
14648 V2 = DAG.getBitcast(MVT::v8i16, V2);
14649 return DAG.getBitcast(MVT::v8f16,
14650 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14651}
14652
14653// Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14654// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14655// the active subvector is extracted.
14656static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14657 ArrayRef<int> OriginalMask, SDValue V1,
14658 SDValue V2, const X86Subtarget &Subtarget,
14659 SelectionDAG &DAG) {
14660 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14661 SmallVector<int, 32> Mask(OriginalMask);
14662 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14663 !isShuffleFoldableLoad(V2)) {
14664 ShuffleVectorSDNode::commuteMask(Mask);
14665 std::swap(V1, V2);
14666 }
14667
14668 MVT MaskVT = VT.changeTypeToInteger();
14669 SDValue MaskNode;
14670 MVT ShuffleVT = VT;
14671 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14672 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14673 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14674 ShuffleVT = V1.getSimpleValueType();
14675
14676 // Adjust mask to correct indices for the second input.
14677 int NumElts = VT.getVectorNumElements();
14678 unsigned Scale = 512 / VT.getSizeInBits();
14679 SmallVector<int, 32> AdjustedMask(Mask);
14680 for (int &M : AdjustedMask)
14681 if (NumElts <= M)
14682 M += (Scale - 1) * NumElts;
14683 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14684 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14685 } else {
14686 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14687 }
14688
14689 SDValue Result;
14690 if (V2.isUndef())
14691 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14692 else
14693 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14694
14695 if (VT != ShuffleVT)
14696 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14697
14698 return Result;
14699}
14700
14701/// Generic lowering of v16i8 shuffles.
14702///
14703/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14704/// detect any complexity reducing interleaving. If that doesn't help, it uses
14705/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14706/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14707/// back together.
14708static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14709 const APInt &Zeroable, SDValue V1, SDValue V2,
14710 const X86Subtarget &Subtarget,
14711 SelectionDAG &DAG) {
14712 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14713 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14714 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14715
14716 // Try to use shift instructions.
14717 if (SDValue Shift =
14718 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14719 DAG, /*BitwiseOnly*/ false))
14720 return Shift;
14721
14722 // Try to use byte rotation instructions.
14723 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14724 Subtarget, DAG))
14725 return Rotate;
14726
14727 // Use dedicated pack instructions for masks that match their pattern.
14728 if (SDValue V =
14729 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14730 return V;
14731
14732 // Try to use a zext lowering.
14733 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14734 Zeroable, Subtarget, DAG))
14735 return ZExt;
14736
14737 // Try to lower using a truncation.
14738 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14739 Subtarget, DAG))
14740 return V;
14741
14742 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14743 Subtarget, DAG))
14744 return V;
14745
14746 // See if we can use SSE4A Extraction / Insertion.
14747 if (Subtarget.hasSSE4A())
14748 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14749 Zeroable, DAG))
14750 return V;
14751
14752 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14753
14754 // For single-input shuffles, there are some nicer lowering tricks we can use.
14755 if (NumV2Elements == 0) {
14756 // Check for being able to broadcast a single element.
14757 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14758 Mask, Subtarget, DAG))
14759 return Broadcast;
14760
14761 // Try to use bit rotation instructions.
14762 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14763 Subtarget, DAG))
14764 return Rotate;
14765
14766 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14767 return V;
14768
14769 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14770 // Notably, this handles splat and partial-splat shuffles more efficiently.
14771 // However, it only makes sense if the pre-duplication shuffle simplifies
14772 // things significantly. Currently, this means we need to be able to
14773 // express the pre-duplication shuffle as an i16 shuffle.
14774 //
14775 // FIXME: We should check for other patterns which can be widened into an
14776 // i16 shuffle as well.
14777 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14778 for (int i = 0; i < 16; i += 2)
14779 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14780 return false;
14781
14782 return true;
14783 };
14784 auto tryToWidenViaDuplication = [&]() -> SDValue {
14785 if (!canWidenViaDuplication(Mask))
14786 return SDValue();
14787 SmallVector<int, 4> LoInputs;
14788 copy_if(Mask, std::back_inserter(LoInputs),
14789 [](int M) { return M >= 0 && M < 8; });
14790 array_pod_sort(LoInputs.begin(), LoInputs.end());
14791 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14792 SmallVector<int, 4> HiInputs;
14793 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14794 array_pod_sort(HiInputs.begin(), HiInputs.end());
14795 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14796
14797 bool TargetLo = LoInputs.size() >= HiInputs.size();
14798 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14799 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14800
14801 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14802 SmallDenseMap<int, int, 8> LaneMap;
14803 for (int I : InPlaceInputs) {
14804 PreDupI16Shuffle[I/2] = I/2;
14805 LaneMap[I] = I;
14806 }
14807 int j = TargetLo ? 0 : 4, je = j + 4;
14808 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14809 // Check if j is already a shuffle of this input. This happens when
14810 // there are two adjacent bytes after we move the low one.
14811 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14812 // If we haven't yet mapped the input, search for a slot into which
14813 // we can map it.
14814 while (j < je && PreDupI16Shuffle[j] >= 0)
14815 ++j;
14816
14817 if (j == je)
14818 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14819 return SDValue();
14820
14821 // Map this input with the i16 shuffle.
14822 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14823 }
14824
14825 // Update the lane map based on the mapping we ended up with.
14826 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14827 }
14828 V1 = DAG.getBitcast(
14829 MVT::v16i8,
14830 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14831 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14832
14833 // Unpack the bytes to form the i16s that will be shuffled into place.
14834 bool EvenInUse = false, OddInUse = false;
14835 for (int i = 0; i < 16; i += 2) {
14836 EvenInUse |= (Mask[i + 0] >= 0);
14837 OddInUse |= (Mask[i + 1] >= 0);
14838 if (EvenInUse && OddInUse)
14839 break;
14840 }
14841 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14842 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14843 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14844
14845 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14846 for (int i = 0; i < 16; ++i)
14847 if (Mask[i] >= 0) {
14848 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14849 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14850 if (PostDupI16Shuffle[i / 2] < 0)
14851 PostDupI16Shuffle[i / 2] = MappedMask;
14852 else
14853 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14854 "Conflicting entries in the original shuffle!");
14855 }
14856 return DAG.getBitcast(
14857 MVT::v16i8,
14858 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14859 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14860 };
14861 if (SDValue V = tryToWidenViaDuplication())
14862 return V;
14863 }
14864
14865 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14866 Zeroable, Subtarget, DAG))
14867 return Masked;
14868
14869 // Use dedicated unpack instructions for masks that match their pattern.
14870 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14871 return V;
14872
14873 // Try to use byte shift instructions to mask.
14874 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14875 Zeroable, Subtarget, DAG))
14876 return V;
14877
14878 // Check for compaction patterns.
14879 bool IsSingleInput = V2.isUndef();
14880 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14881
14882 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14883 // with PSHUFB. It is important to do this before we attempt to generate any
14884 // blends but after all of the single-input lowerings. If the single input
14885 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14886 // want to preserve that and we can DAG combine any longer sequences into
14887 // a PSHUFB in the end. But once we start blending from multiple inputs,
14888 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14889 // and there are *very* few patterns that would actually be faster than the
14890 // PSHUFB approach because of its ability to zero lanes.
14891 //
14892 // If the mask is a binary compaction, we can more efficiently perform this
14893 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14894 //
14895 // FIXME: The only exceptions to the above are blends which are exact
14896 // interleavings with direct instructions supporting them. We currently don't
14897 // handle those well here.
14898 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14899 bool V1InUse = false;
14900 bool V2InUse = false;
14901
14902 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14903 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14904
14905 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14906 // do so. This avoids using them to handle blends-with-zero which is
14907 // important as a single pshufb is significantly faster for that.
14908 if (V1InUse && V2InUse) {
14909 if (Subtarget.hasSSE41())
14910 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14911 Zeroable, Subtarget, DAG))
14912 return Blend;
14913
14914 // We can use an unpack to do the blending rather than an or in some
14915 // cases. Even though the 'or' may be (very marginally) more efficient, we
14916 // prefer this lowering because there are common cases where part of
14917 // the complexity of the shuffles goes away when we do the final blend as
14918 // an unpack.
14919 // FIXME: It might be worth trying to detect if the unpack-feeding
14920 // shuffles will both be pshufb, in which case we shouldn't bother with
14921 // this.
14922 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14923 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14924 return Unpack;
14925
14926 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14927 if (Subtarget.hasVBMI())
14928 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14929 DAG);
14930
14931 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14932 if (Subtarget.hasXOP()) {
14933 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14934 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14935 }
14936
14937 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14938 // PALIGNR will be cheaper than the second PSHUFB+OR.
14939 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14940 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14941 return V;
14942 }
14943
14944 return PSHUFB;
14945 }
14946
14947 // There are special ways we can lower some single-element blends.
14948 if (NumV2Elements == 1)
14949 if (SDValue V = lowerShuffleAsElementInsertion(
14950 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14951 return V;
14952
14953 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14954 return Blend;
14955
14956 // Check whether a compaction lowering can be done. This handles shuffles
14957 // which take every Nth element for some even N. See the helper function for
14958 // details.
14959 //
14960 // We special case these as they can be particularly efficiently handled with
14961 // the PACKUSWB instruction on x86, and they show up in common patterns of
14962 // rearranging bytes to truncate wide elements.
14963 if (NumEvenDrops) {
14964 // NumEvenDrops is the power of two stride of the elements. Another way of
14965 // thinking about it is that we need to drop the even elements this many
14966 // times to get the original input.
14967
14968 // First we need to zero all the dropped bytes.
14969 assert(NumEvenDrops <= 3 &&
14970 "No support for dropping even elements more than 3 times.");
14971 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14972 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14973 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14974 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14975 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14976 WordClearMask);
14977 if (!IsSingleInput)
14978 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14979 WordClearMask);
14980
14981 // Now pack things back together.
14982 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14983 IsSingleInput ? V1 : V2);
14984 for (int i = 1; i < NumEvenDrops; ++i) {
14985 Result = DAG.getBitcast(MVT::v8i16, Result);
14986 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14987 }
14988 return Result;
14989 }
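  // Worked example (added for illustration): a byte-truncating mask such as
  // <0,2,4,...,30> gives NumEvenDrops == 1; ANDing every word with 0x00FF
  // keeps only the even-numbered bytes, and PACKUSWB narrows each word back
  // to a byte, producing the even bytes of V1 followed by those of V2. Larger
  // NumEvenDrops values widen the cleared stride and repeat the pack.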
14990
14991 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14992 if (NumOddDrops == 1) {
14993 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14994 DAG.getBitcast(MVT::v8i16, V1),
14995 DAG.getTargetConstant(8, DL, MVT::i8));
14996 if (!IsSingleInput)
14997 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14998 DAG.getBitcast(MVT::v8i16, V2),
14999 DAG.getTargetConstant(8, DL, MVT::i8));
15000 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15001 IsSingleInput ? V1 : V2);
15002 }
15003
15004 // Handle multi-input cases by blending/unpacking single-input shuffles.
15005 if (NumV2Elements > 0)
15006 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15007 Zeroable, Subtarget, DAG);
15008
15009 // The fallback path for single-input shuffles widens this into two v8i16
15010 // vectors with unpacks, shuffles those, and then pulls them back together
15011 // with a pack.
15012 SDValue V = V1;
15013
15014 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15015 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15016 for (int i = 0; i < 16; ++i)
15017 if (Mask[i] >= 0)
15018 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15019
15020 SDValue VLoHalf, VHiHalf;
15021 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15022 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15023 // i16s.
15024 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15025 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15026 // Use a mask to drop the high bytes.
15027 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15028 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15029 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15030
15031 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15032 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15033
15034 // Squash the masks to point directly into VLoHalf.
15035 for (int &M : LoBlendMask)
15036 if (M >= 0)
15037 M /= 2;
15038 for (int &M : HiBlendMask)
15039 if (M >= 0)
15040 M /= 2;
15041 } else {
15042 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15043 // VHiHalf so that we can blend them as i16s.
15044 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15045
15046 VLoHalf = DAG.getBitcast(
15047 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15048 VHiHalf = DAG.getBitcast(
15049 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15050 }
15051
15052 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15053 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15054
15055 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15056}
15057
15058/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15059///
15060/// This routine breaks down the specific type of 128-bit shuffle and
15061/// dispatches to the lowering routines accordingly.
15062static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15063 MVT VT, SDValue V1, SDValue V2,
15064 const APInt &Zeroable,
15065 const X86Subtarget &Subtarget,
15066 SelectionDAG &DAG) {
15067 if (VT == MVT::v8bf16) {
15068 V1 = DAG.getBitcast(MVT::v8i16, V1);
15069 V2 = DAG.getBitcast(MVT::v8i16, V2);
15070 return DAG.getBitcast(VT,
15071 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15072 }
15073
15074 switch (VT.SimpleTy) {
15075 case MVT::v2i64:
15076 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15077 case MVT::v2f64:
15078 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15079 case MVT::v4i32:
15080 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15081 case MVT::v4f32:
15082 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15083 case MVT::v8i16:
15084 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15085 case MVT::v8f16:
15086 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15087 case MVT::v16i8:
15088 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15089
15090 default:
15091 llvm_unreachable("Unimplemented!");
15092 }
15093}
15094
15095/// Generic routine to split vector shuffle into half-sized shuffles.
15096///
15097/// This routine just extracts two subvectors, shuffles them independently, and
15098/// then concatenates them back together. This should work effectively with all
15099/// AVX vector shuffle types.
15100static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15101 SDValue V2, ArrayRef<int> Mask,
15102 SelectionDAG &DAG, bool SimpleOnly) {
15103 assert(VT.getSizeInBits() >= 256 &&
15104 "Only for 256-bit or wider vector shuffles!");
15105 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15106 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15107
15108 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15109 if (VT == MVT::v8f32) {
15110 SDValue BC1 = peekThroughBitcasts(V1);
15111 SDValue BC2 = peekThroughBitcasts(V2);
15112 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15113 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15114 DAG, SimpleOnly))
15115 return DAG.getBitcast(VT, Split);
15116 }
15117 }
15118
15119 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15120 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15121
15122 int NumElements = VT.getVectorNumElements();
15123 int SplitNumElements = NumElements / 2;
15124 MVT ScalarVT = VT.getVectorElementType();
15125 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15126
15127 // Use splitVector/extractSubVector so that split build-vectors just build two
15128 // narrower build vectors. This helps shuffling with splats and zeros.
15129 auto SplitVector = [&](SDValue V) {
15130 SDValue LoV, HiV;
15131 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15132 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15133 DAG.getBitcast(SplitVT, HiV));
15134 };
15135
15136 SDValue LoV1, HiV1, LoV2, HiV2;
15137 std::tie(LoV1, HiV1) = SplitVector(V1);
15138 std::tie(LoV2, HiV2) = SplitVector(V2);
15139
15140 // Now create two 4-way blends of these half-width vectors.
15141 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15142 bool &UseHiV1, bool &UseLoV2,
15143 bool &UseHiV2) {
15144 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15145 for (int i = 0; i < SplitNumElements; ++i) {
15146 int M = HalfMask[i];
15147 if (M >= NumElements) {
15148 if (M >= NumElements + SplitNumElements)
15149 UseHiV2 = true;
15150 else
15151 UseLoV2 = true;
15152 } else if (M >= 0) {
15153 if (M >= SplitNumElements)
15154 UseHiV1 = true;
15155 else
15156 UseLoV1 = true;
15157 }
15158 }
15159 };
15160
15161 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15162 if (!SimpleOnly)
15163 return true;
15164
15165 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15166 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15167
15168 return !(UseHiV1 || UseHiV2);
15169 };
15170
15171 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15172 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15173 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15174 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15175 for (int i = 0; i < SplitNumElements; ++i) {
15176 int M = HalfMask[i];
15177 if (M >= NumElements) {
15178 V2BlendMask[i] = M - NumElements;
15179 BlendMask[i] = SplitNumElements + i;
15180 } else if (M >= 0) {
15181 V1BlendMask[i] = M;
15182 BlendMask[i] = i;
15183 }
15184 }
15185
15186 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15187 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15188
15189 // Because the lowering happens after all combining takes place, we need to
15190 // manually combine these blend masks as much as possible so that we create
15191 // a minimal number of high-level vector shuffle nodes.
15192 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15193
15194 // First try just blending the halves of V1 or V2.
15195 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15196 return DAG.getUNDEF(SplitVT);
15197 if (!UseLoV2 && !UseHiV2)
15198 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15199 if (!UseLoV1 && !UseHiV1)
15200 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15201
15202 SDValue V1Blend, V2Blend;
15203 if (UseLoV1 && UseHiV1) {
15204 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15205 } else {
15206 // We only use half of V1 so map the usage down into the final blend mask.
15207 V1Blend = UseLoV1 ? LoV1 : HiV1;
15208 for (int i = 0; i < SplitNumElements; ++i)
15209 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15210 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15211 }
15212 if (UseLoV2 && UseHiV2) {
15213 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15214 } else {
15215 // We only use half of V2 so map the usage down into the final blend mask.
15216 V2Blend = UseLoV2 ? LoV2 : HiV2;
15217 for (int i = 0; i < SplitNumElements; ++i)
15218 if (BlendMask[i] >= SplitNumElements)
15219 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15220 }
15221 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15222 };
15223
15224 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15225 return SDValue();
15226
15227 SDValue Lo = HalfBlend(LoMask);
15228 SDValue Hi = HalfBlend(HiMask);
15229 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15230}
15231
15232/// Either split a vector in halves or decompose the shuffles and the
15233/// blend/unpack.
15234///
15235/// This is provided as a good fallback for many lowerings of non-single-input
15236/// shuffles with more than one 128-bit lane. In those cases, we want to select
15237/// between splitting the shuffle into 128-bit components and stitching those
15238/// back together vs. extracting the single-input shuffles and blending those
15239/// results.
15240static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15241 SDValue V2, ArrayRef<int> Mask,
15242 const APInt &Zeroable,
15243 const X86Subtarget &Subtarget,
15244 SelectionDAG &DAG) {
15245 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15246 "shuffles as it could then recurse on itself.");
15247 int Size = Mask.size();
15248
15249 // If this can be modeled as a broadcast of two elements followed by a blend,
15250 // prefer that lowering. This is especially important because broadcasts can
15251 // often fold with memory operands.
15252 auto DoBothBroadcast = [&] {
15253 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15254 for (int M : Mask)
15255 if (M >= Size) {
15256 if (V2BroadcastIdx < 0)
15257 V2BroadcastIdx = M - Size;
15258 else if ((M - Size) != V2BroadcastIdx &&
15259 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15260 return false;
15261 } else if (M >= 0) {
15262 if (V1BroadcastIdx < 0)
15263 V1BroadcastIdx = M;
15264 else if (M != V1BroadcastIdx &&
15265 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15266 return false;
15267 }
15268 return true;
15269 };
15270 if (DoBothBroadcast())
15271 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15272 Subtarget, DAG);
15273
15274 // If the inputs all stem from a single 128-bit lane of each input, then we
15275 // split them rather than blending because the split will decompose to
15276 // unusually few instructions.
15277 int LaneCount = VT.getSizeInBits() / 128;
15278 int LaneSize = Size / LaneCount;
15279 SmallBitVector LaneInputs[2];
15280 LaneInputs[0].resize(LaneCount, false);
15281 LaneInputs[1].resize(LaneCount, false);
15282 for (int i = 0; i < Size; ++i)
15283 if (Mask[i] >= 0)
15284 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15285 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15286 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15287 /*SimpleOnly*/ false);
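  // Illustrative note (added): e.g. a v8f32 shuffle whose V1 references all
  // fall in V1's low 128-bit lane and whose V2 references all fall in V2's
  // high lane sets a single bit in each LaneInputs set, so the split above is
  // taken instead of the more expensive decomposed blend.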
15288
15289 // Without AVX2, if we can freely split the subvectors then we're better off
15290 // performing half width shuffles.
15291 if (!Subtarget.hasAVX2()) {
15292 SDValue BC1 = peekThroughBitcasts(V1);
15293 SDValue BC2 = peekThroughBitcasts(V2);
15294 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15295 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15296 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15297 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15298 if (SplatOrSplitV1 && SplatOrSplitV2)
15299 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15300 /*SimpleOnly*/ false);
15301 }
15302
15303 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15304 // requires that the decomposed single-input shuffles don't end up here.
15305 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15306 Subtarget, DAG);
15307}
15308
15309// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15310// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15311static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15312 SDValue V1, SDValue V2,
15313 ArrayRef<int> Mask,
15314 SelectionDAG &DAG) {
15315 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15316
15317 int LHSMask[4] = {-1, -1, -1, -1};
15318 int RHSMask[4] = {-1, -1, -1, -1};
15319 int SHUFPDMask[4] = {-1, -1, -1, -1};
15320
15321 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15322 // perform the shuffle once the lanes have been shuffled in place.
15323 for (int i = 0; i != 4; ++i) {
15324 int M = Mask[i];
15325 if (M < 0)
15326 continue;
15327 int LaneBase = i & ~1;
15328 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15329 LaneMask[LaneBase + (M & 1)] = M;
15330 SHUFPDMask[i] = M & 1;
15331 }
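  // Worked example (added for illustration): for the v4f64 mask <2,4,3,6> the
  // loop above yields LHSMask = <2,u,u,3>, RHSMask = <4,u,6,u> and
  // SHUFPDMask = {0,0,1,0}; SHUFPD below then picks one element per 128-bit
  // lane from each pre-shuffled operand to realize the original mask.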
15332
15333 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15334 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15335 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15336 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15337}
15338
15339/// Lower a vector shuffle crossing multiple 128-bit lanes as
15340/// a lane permutation followed by a per-lane permutation.
15341///
15342/// This is mainly for cases where we can have non-repeating permutes
15343/// in each lane.
15344///
15345/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15346/// we should investigate merging them.
15347static SDValue lowerShuffleAsLanePermuteAndPermute(
15348 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15349 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15350 int NumElts = VT.getVectorNumElements();
15351 int NumLanes = VT.getSizeInBits() / 128;
15352 int NumEltsPerLane = NumElts / NumLanes;
15353 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15354
15355 /// Attempts to find a sublane permute with the given size
15356 /// that gets all elements into their target lanes.
15357 ///
15358 /// If successful, fills CrossLaneMask and InLaneMask and returns the
15359 /// shuffled result; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15360 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15361 int NumSublanesPerLane = NumSublanes / NumLanes;
15362 int NumEltsPerSublane = NumElts / NumSublanes;
15363
15364 SmallVector<int, 16> CrossLaneMask;
15365 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15366 // CrossLaneMask but one entry == one sublane.
15367 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15368 APInt DemandedCrossLane = APInt::getZero(NumElts);
15369
15370 for (int i = 0; i != NumElts; ++i) {
15371 int M = Mask[i];
15372 if (M < 0)
15373 continue;
15374
15375 int SrcSublane = M / NumEltsPerSublane;
15376 int DstLane = i / NumEltsPerLane;
15377
15378 // We only need to get the elements into the right lane, not sublane.
15379 // So search all sublanes that make up the destination lane.
15380 bool Found = false;
15381 int DstSubStart = DstLane * NumSublanesPerLane;
15382 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15383 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15384 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15385 continue;
15386
15387 Found = true;
15388 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15389 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15390 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15391 DemandedCrossLane.setBit(InLaneMask[i]);
15392 break;
15393 }
15394 if (!Found)
15395 return SDValue();
15396 }
15397
15398 // Fill CrossLaneMask using CrossLaneMaskLarge.
15399 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15400
15401 if (!CanUseSublanes) {
15402 // If we're only shuffling a single lowest lane and the rest are identity
15403 // then don't bother.
15404 // TODO - isShuffleMaskInputInPlace could be extended to something like
15405 // this.
15406 int NumIdentityLanes = 0;
15407 bool OnlyShuffleLowestLane = true;
15408 for (int i = 0; i != NumLanes; ++i) {
15409 int LaneOffset = i * NumEltsPerLane;
15410 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15411 i * NumEltsPerLane))
15412 NumIdentityLanes++;
15413 else if (CrossLaneMask[LaneOffset] != 0)
15414 OnlyShuffleLowestLane = false;
15415 }
15416 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15417 return SDValue();
15418 }
15419
15420 // Simplify CrossLaneMask based on the actual demanded elements.
15421 if (V1.hasOneUse())
15422 for (int i = 0; i != NumElts; ++i)
15423 if (!DemandedCrossLane[i])
15424 CrossLaneMask[i] = SM_SentinelUndef;
15425
15426 // Avoid returning the same shuffle operation. For example,
15427 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15428 // undef:v16i16
15429 if (CrossLaneMask == Mask || InLaneMask == Mask)
15430 return SDValue();
15431
15432 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15433 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15434 InLaneMask);
15435 };
15436
15437 // First attempt a solution with full lanes.
15438 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15439 return V;
15440
15441 // The rest of the solutions use sublanes.
15442 if (!CanUseSublanes)
15443 return SDValue();
15444
15445 // Then attempt a solution with 64-bit sublanes (vpermq).
15446 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15447 return V;
15448
15449 // If that doesn't work and we have fast variable cross-lane shuffle,
15450 // attempt 32-bit sublanes (vpermd).
15451 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15452 return SDValue();
15453
15454 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15455}
15456
15457/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
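/// For example (added for clarity): with LaneSize == 4 and a v8f32 mask, an
/// element M == 5 sitting in slot 1 crosses lanes, so it is remapped to
/// (5 % 4) + (0 * 4) + Size == 9, i.e. it now selects from the lane-flipped
/// copy of the source that callers such as lowerShuffleAsLanePermuteAndShuffle
/// pass as the second operand of the follow-up shuffle.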
15458static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15459 SmallVector<int> &InLaneMask) {
15460 int Size = Mask.size();
15461 InLaneMask.assign(Mask.begin(), Mask.end());
15462 for (int i = 0; i < Size; ++i) {
15463 int &M = InLaneMask[i];
15464 if (M < 0)
15465 continue;
15466 if (((M % Size) / LaneSize) != (i / LaneSize))
15467 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15468 }
15469}
15470
15471/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15472/// source with a lane permutation.
15473///
15474/// This lowering strategy results in four instructions in the worst case for a
15475/// single-input cross lane shuffle which is lower than any other fully general
15476/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15477/// shuffle pattern should be handled prior to trying this lowering.
15478static SDValue lowerShuffleAsLanePermuteAndShuffle(
15479 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15480 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15481 // FIXME: This should probably be generalized for 512-bit vectors as well.
15482 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15483 int Size = Mask.size();
15484 int LaneSize = Size / 2;
15485
15486 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15487 // Only do this if the elements aren't all from the lower lane,
15488 // otherwise we're (probably) better off doing a split.
15489 if (VT == MVT::v4f64 &&
15490 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15491 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15492
15493 // If there are only inputs from one 128-bit lane, splitting will in fact be
15494 // less expensive. The flags track whether the given lane contains an element
15495 // that crosses to another lane.
15496 bool AllLanes;
15497 if (!Subtarget.hasAVX2()) {
15498 bool LaneCrossing[2] = {false, false};
15499 for (int i = 0; i < Size; ++i)
15500 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15501 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15502 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15503 } else {
15504 bool LaneUsed[2] = {false, false};
15505 for (int i = 0; i < Size; ++i)
15506 if (Mask[i] >= 0)
15507 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15508 AllLanes = LaneUsed[0] && LaneUsed[1];
15509 }
15510
15511 // TODO - we could support shuffling V2 in the Flipped input.
15512 assert(V2.isUndef() &&
15513 "This last part of this routine only works on single input shuffles");
15514
15515 SmallVector<int> InLaneMask;
15516 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15517
15518 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15519 "In-lane shuffle mask expected");
15520
15521 // If we're not using both lanes in each lane and the inlane mask is not
15522 // repeating, then we're better off splitting.
15523 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15524 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15525 /*SimpleOnly*/ false);
15526
15527 // Flip the lanes, and shuffle the results which should now be in-lane.
15528 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15529 SDValue Flipped = DAG.getBitcast(PVT, V1);
15530 Flipped =
15531 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15532 Flipped = DAG.getBitcast(VT, Flipped);
15533 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15534}
15535
15536/// Handle lowering 2-lane 128-bit shuffles.
15537 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15538 SDValue V2, ArrayRef<int> Mask,
15539 const APInt &Zeroable,
15540 const X86Subtarget &Subtarget,
15541 SelectionDAG &DAG) {
15542 if (V2.isUndef()) {
15543 // Attempt to match VBROADCAST*128 subvector broadcast load.
15544 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15545 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15546 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15547 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15548 MVT MemVT = VT.getHalfNumVectorElementsVT();
15549 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15550 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15551 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15552 VT, MemVT, Ld, Ofs, DAG))
15553 return BcstLd;
15554 }
15555
15556 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15557 if (Subtarget.hasAVX2())
15558 return SDValue();
15559 }
15560
15561 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15562
15563 SmallVector<int, 4> WidenedMask;
15564 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15565 return SDValue();
15566
15567 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15568 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15569
15570 // Try to use an insert into a zero vector.
15571 if (WidenedMask[0] == 0 && IsHighZero) {
15572 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15573 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15574 DAG.getVectorIdxConstant(0, DL));
15575 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15576 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15577 DAG.getVectorIdxConstant(0, DL));
15578 }
15579
15580 // TODO: If minimizing size and one of the inputs is a zero vector and the
15581 // zero vector has only one use, we could use a VPERM2X128 to save the
15582 // instruction bytes needed to explicitly generate the zero vector.
15583
15584 // Blends are faster and handle all the non-lane-crossing cases.
15585 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15586 Subtarget, DAG))
15587 return Blend;
15588
15589 // If either input operand is a zero vector, use VPERM2X128 because its mask
15590 // allows us to replace the zero input with an implicit zero.
15591 if (!IsLowZero && !IsHighZero) {
15592 // Check for patterns which can be matched with a single insert of a 128-bit
15593 // subvector.
15594 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15595 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15596
15597 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15598 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15599 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15600 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15601 SDValue SubVec =
15602 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15603 DAG.getVectorIdxConstant(0, DL));
15604 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15605 DAG.getVectorIdxConstant(2, DL));
15606 }
15607 }
15608
15609 // Try to use SHUF128 if possible.
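// e.g. WidenedMask <0, 3> (V1 low half, V2 high half) yields an immediate
// of 0b10.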
15610 if (Subtarget.hasVLX()) {
15611 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15612 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15613 ((WidenedMask[1] % 2) << 1);
15614 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15615 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15616 }
15617 }
15618 }
15619
15620 // Otherwise form a 128-bit permutation. After accounting for undefs,
15621 // convert the 64-bit shuffle mask selection values into 128-bit
15622 // selection bits by dividing the indexes by 2 and shifting into positions
15623 // defined by a vperm2*128 instruction's immediate control byte.
15624
15625 // The immediate permute control byte looks like this:
15626 // [1:0] - select 128 bits from sources for low half of destination
15627 // [2] - ignore
15628 // [3] - zero low half of destination
15629 // [5:4] - select 128 bits from sources for high half of destination
15630 // [6] - ignore
15631 // [7] - zero high half of destination
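// For example, WidenedMask = <1, 2> (upper half of V1, lower half of V2)
// gives PermMask = 0x21, and WidenedMask[0] = 0 with a zeroable high half
// gives PermMask = 0x80.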
15632
15633 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15634 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15635
15636 unsigned PermMask = 0;
15637 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15638 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15639
15640 // Check the immediate mask and replace unused sources with undef.
15641 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15642 V1 = DAG.getUNDEF(VT);
15643 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15644 V2 = DAG.getUNDEF(VT);
15645
15646 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15647 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15648}
15649
15650/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15651/// shuffling each lane.
15652///
15653/// This attempts to create a repeated lane shuffle where each lane uses one
15654/// or two of the lanes of the inputs. The lanes of the input vectors are
15655/// shuffled in one or two independent shuffles to get the lanes into the
15656/// position needed by the final shuffle.
15657 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15658 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15659 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15660 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15661
15662 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15663 return SDValue();
15664
15665 int NumElts = Mask.size();
15666 int NumLanes = VT.getSizeInBits() / 128;
15667 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15668 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15669 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15670
15671 // First pass will try to fill in the RepeatMask from lanes that need two
15672 // sources.
15673 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15674 int Srcs[2] = {-1, -1};
15675 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15676 for (int i = 0; i != NumLaneElts; ++i) {
15677 int M = Mask[(Lane * NumLaneElts) + i];
15678 if (M < 0)
15679 continue;
15680 // Determine which of the possible input lanes (NumLanes from each source)
15681 // this element comes from. Assign that as one of the sources for this
15682 // lane. We can assign up to 2 sources for this lane. If we run out of
15683 // sources we can't do anything.
15684 int LaneSrc = M / NumLaneElts;
15685 int Src;
15686 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15687 Src = 0;
15688 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15689 Src = 1;
15690 else
15691 return SDValue();
15692
15693 Srcs[Src] = LaneSrc;
15694 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15695 }
15696
15697 // If this lane has two sources, see if it fits with the repeat mask so far.
15698 if (Srcs[1] < 0)
15699 continue;
15700
15701 LaneSrcs[Lane][0] = Srcs[0];
15702 LaneSrcs[Lane][1] = Srcs[1];
15703
15704 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15705 assert(M1.size() == M2.size() && "Unexpected mask size");
15706 for (int i = 0, e = M1.size(); i != e; ++i)
15707 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15708 return false;
15709 return true;
15710 };
15711
15712 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15713 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15714 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15715 int M = Mask[i];
15716 if (M < 0)
15717 continue;
15718 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15719 "Unexpected mask element");
15720 MergedMask[i] = M;
15721 }
15722 };
15723
15724 if (MatchMasks(InLaneMask, RepeatMask)) {
15725 // Merge this lane mask into the final repeat mask.
15726 MergeMasks(InLaneMask, RepeatMask);
15727 continue;
15728 }
15729
15730 // Didn't find a match. Swap the operands and try again.
15731 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15732 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, InLaneMask.size());
15733
15734 if (MatchMasks(InLaneMask, RepeatMask)) {
15735 // Merge this lane mask into the final repeat mask.
15736 MergeMasks(InLaneMask, RepeatMask);
15737 continue;
15738 }
15739
15740 // Couldn't find a match with the operands in either order.
15741 return SDValue();
15742 }
15743
15744 // Now handle any lanes with only one source.
15745 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15746 // If this lane has already been processed, skip it.
15747 if (LaneSrcs[Lane][0] >= 0)
15748 continue;
15749
15750 for (int i = 0; i != NumLaneElts; ++i) {
15751 int M = Mask[(Lane * NumLaneElts) + i];
15752 if (M < 0)
15753 continue;
15754
15755 // If RepeatMask isn't defined yet we can define it ourselves.
15756 if (RepeatMask[i] < 0)
15757 RepeatMask[i] = M % NumLaneElts;
15758
15759 if (RepeatMask[i] < NumElts) {
15760 if (RepeatMask[i] != M % NumLaneElts)
15761 return SDValue();
15762 LaneSrcs[Lane][0] = M / NumLaneElts;
15763 } else {
15764 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15765 return SDValue();
15766 LaneSrcs[Lane][1] = M / NumLaneElts;
15767 }
15768 }
15769
15770 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15771 return SDValue();
15772 }
15773
15774 SmallVector<int, 16> NewMask(NumElts, -1);
15775 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15776 int Src = LaneSrcs[Lane][0];
15777 for (int i = 0; i != NumLaneElts; ++i) {
15778 int M = -1;
15779 if (Src >= 0)
15780 M = Src * NumLaneElts + i;
15781 NewMask[Lane * NumLaneElts + i] = M;
15782 }
15783 }
15784 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15785 // Ensure we didn't get back the shuffle we started with.
15786 // FIXME: This is a hack to make up for some splat handling code in
15787 // getVectorShuffle.
15788 if (isa<ShuffleVectorSDNode>(NewV1) &&
15789 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15790 return SDValue();
15791
15792 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15793 int Src = LaneSrcs[Lane][1];
15794 for (int i = 0; i != NumLaneElts; ++i) {
15795 int M = -1;
15796 if (Src >= 0)
15797 M = Src * NumLaneElts + i;
15798 NewMask[Lane * NumLaneElts + i] = M;
15799 }
15800 }
15801 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15802 // Ensure we didn't get back the shuffle we started with.
15803 // FIXME: This is a hack to make up for some splat handling code in
15804 // getVectorShuffle.
15805 if (isa<ShuffleVectorSDNode>(NewV2) &&
15806 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15807 return SDValue();
15808
15809 for (int i = 0; i != NumElts; ++i) {
15810 if (Mask[i] < 0) {
15811 NewMask[i] = -1;
15812 continue;
15813 }
15814 NewMask[i] = RepeatMask[i % NumLaneElts];
15815 if (NewMask[i] < 0)
15816 continue;
15817
15818 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15819 }
15820 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15821}
15822
15823/// If the input shuffle mask results in a vector that is undefined in all upper
15824/// or lower half elements and that mask accesses only 2 halves of the
15825/// shuffle's operands, return true. A mask of half the width with mask indexes
15826/// adjusted to access the extracted halves of the original shuffle operands is
15827 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half (lower or
15828 /// upper) of each input operand is accessed.
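/// e.g. for the v8 mask <u,u,u,u,2,3,12,13>, the defined (upper) half uses
/// the lower half of V1 and the upper half of V2, giving
/// HalfMask = <2,3,4,5>, HalfIdx1 = 0 and HalfIdx2 = 3.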
15829static bool
15830 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15831 int &HalfIdx1, int &HalfIdx2) {
15832 assert((Mask.size() == HalfMask.size() * 2) &&
15833 "Expected input mask to be twice as long as output");
15834
15835 // Exactly one half of the result must be undef to allow narrowing.
15836 bool UndefLower = isUndefLowerHalf(Mask);
15837 bool UndefUpper = isUndefUpperHalf(Mask);
15838 if (UndefLower == UndefUpper)
15839 return false;
15840
15841 unsigned HalfNumElts = HalfMask.size();
15842 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15843 HalfIdx1 = -1;
15844 HalfIdx2 = -1;
15845 for (unsigned i = 0; i != HalfNumElts; ++i) {
15846 int M = Mask[i + MaskIndexOffset];
15847 if (M < 0) {
15848 HalfMask[i] = M;
15849 continue;
15850 }
15851
15852 // Determine which of the 4 half vectors this element is from.
15853 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15854 int HalfIdx = M / HalfNumElts;
15855
15856 // Determine the element index into its half vector source.
15857 int HalfElt = M % HalfNumElts;
15858
15859 // We can shuffle with up to 2 half vectors, set the new 'half'
15860 // shuffle mask accordingly.
15861 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15862 HalfMask[i] = HalfElt;
15863 HalfIdx1 = HalfIdx;
15864 continue;
15865 }
15866 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15867 HalfMask[i] = HalfElt + HalfNumElts;
15868 HalfIdx2 = HalfIdx;
15869 continue;
15870 }
15871
15872 // Too many half vectors referenced.
15873 return false;
15874 }
15875
15876 return true;
15877}
15878
15879/// Given the output values from getHalfShuffleMask(), create a half width
15880/// shuffle of extracted vectors followed by an insert back to full width.
15881 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15882 ArrayRef<int> HalfMask, int HalfIdx1,
15883 int HalfIdx2, bool UndefLower,
15884 SelectionDAG &DAG, bool UseConcat = false) {
15885 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15886 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15887
15888 MVT VT = V1.getSimpleValueType();
15889 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15890 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15891
15892 auto getHalfVector = [&](int HalfIdx) {
15893 if (HalfIdx < 0)
15894 return DAG.getUNDEF(HalfVT);
15895 SDValue V = (HalfIdx < 2 ? V1 : V2);
15896 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15897 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15898 DAG.getVectorIdxConstant(HalfIdx, DL));
15899 };
15900
15901 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15902 SDValue Half1 = getHalfVector(HalfIdx1);
15903 SDValue Half2 = getHalfVector(HalfIdx2);
15904 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15905 if (UseConcat) {
15906 SDValue Op0 = V;
15907 SDValue Op1 = DAG.getUNDEF(HalfVT);
15908 if (UndefLower)
15909 std::swap(Op0, Op1);
15910 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15911 }
15912
15913 unsigned Offset = UndefLower ? HalfNumElts : 0;
15914 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15916}
15917
15918/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15919/// This allows for fast cases such as subvector extraction/insertion
15920/// or shuffling smaller vector types which can lower more efficiently.
15921 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15922 SDValue V2, ArrayRef<int> Mask,
15923 const X86Subtarget &Subtarget,
15924 SelectionDAG &DAG) {
15925 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15926 "Expected 256-bit or 512-bit vector");
15927
15928 bool UndefLower = isUndefLowerHalf(Mask);
15929 if (!UndefLower && !isUndefUpperHalf(Mask))
15930 return SDValue();
15931
15932 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15933 "Completely undef shuffle mask should have been simplified already");
15934
15935 // Upper half is undef and lower half is whole upper subvector.
15936 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15937 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15938 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15939 if (!UndefLower &&
15940 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15941 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15942 DAG.getVectorIdxConstant(HalfNumElts, DL));
15943 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15944 DAG.getVectorIdxConstant(0, DL));
15945 }
15946
15947 // Lower half is undef and upper half is whole lower subvector.
15948 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15949 if (UndefLower &&
15950 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15951 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15952 DAG.getVectorIdxConstant(0, DL));
15953 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15954 DAG.getVectorIdxConstant(HalfNumElts, DL));
15955 }
15956
15957 int HalfIdx1, HalfIdx2;
15958 SmallVector<int, 8> HalfMask(HalfNumElts);
15959 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15960 return SDValue();
15961
15962 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15963
15964 // Only shuffle the halves of the inputs when useful.
15965 unsigned NumLowerHalves =
15966 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15967 unsigned NumUpperHalves =
15968 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15969 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15970
15971 // Determine the larger pattern of undef/halves, then decide if it's worth
15972 // splitting the shuffle based on subtarget capabilities and types.
15973 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15974 if (!UndefLower) {
15975 // XXXXuuuu: no insert is needed.
15976 // Always extract lowers when setting lower - these are all free subreg ops.
15977 if (NumUpperHalves == 0)
15978 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15979 UndefLower, DAG);
15980
15981 if (NumUpperHalves == 1) {
15982 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15983 if (Subtarget.hasAVX2()) {
15984 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15985 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15986 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15987 (!isSingleSHUFPSMask(HalfMask) ||
15988 Subtarget.hasFastVariableCrossLaneShuffle()))
15989 return SDValue();
15990 // If this is a unary shuffle (assume that the 2nd operand is
15991 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15992 // are better off extracting the upper half of 1 operand and using a
15993 // narrow shuffle.
15994 if (EltWidth == 64 && V2.isUndef())
15995 return SDValue();
15996 // If this is a unary vXi8 shuffle with in-place halves, then perform a
15997 // full width pshufb, and then merge.
15998 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15999 return SDValue();
16000 }
16001 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16002 if (Subtarget.hasAVX512() && VT.is512BitVector())
16003 return SDValue();
16004 // Extract + narrow shuffle is better than the wide alternative.
16005 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16006 UndefLower, DAG);
16007 }
16008
16009 // Don't extract both uppers, instead shuffle and then extract.
16010 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16011 return SDValue();
16012 }
16013
16014 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16015 if (NumUpperHalves == 0) {
16016 // AVX2 has efficient 64-bit element cross-lane shuffles.
16017 // TODO: Refine to account for unary shuffle, splat, and other masks?
16018 if (Subtarget.hasAVX2() && EltWidth == 64)
16019 return SDValue();
16020 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16021 if (Subtarget.hasAVX512() && VT.is512BitVector())
16022 return SDValue();
16023 // Narrow shuffle + insert is better than the wide alternative.
16024 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16025 UndefLower, DAG);
16026 }
16027
16028 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16029 return SDValue();
16030}
16031
16032/// Handle case where shuffle sources are coming from the same 128-bit lane and
16033/// every lane can be represented as the same repeating mask - allowing us to
16034/// shuffle the sources with the repeating shuffle and then permute the result
16035/// to the destination lanes.
16036 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16037 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16038 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16039 int NumElts = VT.getVectorNumElements();
16040 int NumLanes = VT.getSizeInBits() / 128;
16041 int NumLaneElts = NumElts / NumLanes;
16042
16043 // On AVX2 we may be able to just shuffle the lowest elements and then
16044 // broadcast the result.
16045 if (Subtarget.hasAVX2()) {
16046 for (unsigned BroadcastSize : {16, 32, 64}) {
16047 if (BroadcastSize <= VT.getScalarSizeInBits())
16048 continue;
16049 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16050
16051 // Attempt to match a repeating pattern every NumBroadcastElts,
16052 // accounting for UNDEFs, but only referencing the lowest 128-bit
16053 // lane of the inputs.
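// e.g. for v8i32 with BroadcastSize = 64, the mask <1,0,1,0,1,0,1,0>
// repeats the 64-bit pattern <1,0> from the lowest lane: shuffle <1,0>
// into the low elements, then broadcast with BroadcastMask
// <0,1,0,1,0,1,0,1>.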
16054 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16055 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16056 for (int j = 0; j != NumBroadcastElts; ++j) {
16057 int M = Mask[i + j];
16058 if (M < 0)
16059 continue;
16060 int &R = RepeatMask[j];
16061 if (0 != ((M % NumElts) / NumLaneElts))
16062 return false;
16063 if (0 <= R && R != M)
16064 return false;
16065 R = M;
16066 }
16067 return true;
16068 };
16069
16070 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16071 if (!FindRepeatingBroadcastMask(RepeatMask))
16072 continue;
16073
16074 // Shuffle the (lowest) repeated elements in place for broadcast.
16075 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16076
16077 // Shuffle the actual broadcast.
16078 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16079 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16080 for (int j = 0; j != NumBroadcastElts; ++j)
16081 BroadcastMask[i + j] = j;
16082
16083 // Avoid returning the same shuffle operation. For example,
16084 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16085 if (BroadcastMask == Mask)
16086 return SDValue();
16087
16088 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16089 BroadcastMask);
16090 }
16091 }
16092
16093 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16094 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16095 return SDValue();
16096
16097 // Bail if we already have a repeated lane shuffle mask.
16098 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16099 return SDValue();
16100
16101 // Helper to look for a repeated mask in each split sublane, and check that
16102 // those sublanes can then be permuted into place.
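// e.g. v8i32 <5,4,1,0,3,2,7,6> with SubLaneScale = 2 becomes the in-lane
// repeated shuffle <1,0,3,2,5,4,7,6> followed by the 64-bit sub-lane
// permute <4,5,0,1,2,3,6,7> (qwords <2,0,1,3>).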
16103 auto ShuffleSubLanes = [&](int SubLaneScale) {
16104 int NumSubLanes = NumLanes * SubLaneScale;
16105 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16106
16107 // Check that all the sources are coming from the same lane and see if we
16108 // can form a repeating shuffle mask (local to each sub-lane). At the same
16109 // time, determine the source sub-lane for each destination sub-lane.
16110 int TopSrcSubLane = -1;
16111 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16112 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16113 SubLaneScale,
16114 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16115
16116 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16117 // Extract the sub-lane mask, check that it all comes from the same lane
16118 // and normalize the mask entries to come from the first lane.
16119 int SrcLane = -1;
16120 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16121 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16122 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16123 if (M < 0)
16124 continue;
16125 int Lane = (M % NumElts) / NumLaneElts;
16126 if ((0 <= SrcLane) && (SrcLane != Lane))
16127 return SDValue();
16128 SrcLane = Lane;
16129 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16130 SubLaneMask[Elt] = LocalM;
16131 }
16132
16133 // Whole sub-lane is UNDEF.
16134 if (SrcLane < 0)
16135 continue;
16136
16137 // Attempt to match against the candidate repeated sub-lane masks.
16138 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16139 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16140 for (int i = 0; i != NumSubLaneElts; ++i) {
16141 if (M1[i] < 0 || M2[i] < 0)
16142 continue;
16143 if (M1[i] != M2[i])
16144 return false;
16145 }
16146 return true;
16147 };
16148
16149 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16150 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16151 continue;
16152
16153 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16154 for (int i = 0; i != NumSubLaneElts; ++i) {
16155 int M = SubLaneMask[i];
16156 if (M < 0)
16157 continue;
16158 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16159 "Unexpected mask element");
16160 RepeatedSubLaneMask[i] = M;
16161 }
16162
16163 // Track the top most source sub-lane - by setting the remaining to
16164 // UNDEF we can greatly simplify shuffle matching.
16165 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16166 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16167 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16168 break;
16169 }
16170
16171 // Bail if we failed to find a matching repeated sub-lane mask.
16172 if (Dst2SrcSubLanes[DstSubLane] < 0)
16173 return SDValue();
16174 }
16175 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16176 "Unexpected source lane");
16177
16178 // Create a repeating shuffle mask for the entire vector.
16179 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16180 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16181 int Lane = SubLane / SubLaneScale;
16182 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16183 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16184 int M = RepeatedSubLaneMask[Elt];
16185 if (M < 0)
16186 continue;
16187 int Idx = (SubLane * NumSubLaneElts) + Elt;
16188 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16189 }
16190 }
16191
16192 // Shuffle each source sub-lane to its destination.
16193 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16194 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16195 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16196 if (SrcSubLane < 0)
16197 continue;
16198 for (int j = 0; j != NumSubLaneElts; ++j)
16199 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16200 }
16201
16202 // Avoid returning the same shuffle operation.
16203 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16204 if (RepeatedMask == Mask || SubLaneMask == Mask)
16205 return SDValue();
16206
16207 SDValue RepeatedShuffle =
16208 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16209
16210 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16211 SubLaneMask);
16212 };
16213
16214 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16215 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16216 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16217 // Otherwise we can only permute whole 128-bit lanes.
16218 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16219 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16220 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16221 MinSubLaneScale = 2;
16222 MaxSubLaneScale =
16223 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16224 }
16225 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16226 MinSubLaneScale = MaxSubLaneScale = 4;
16227
16228 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16229 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16230 return Shuffle;
16231
16232 return SDValue();
16233}
16234
16235 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16236 bool &ForceV1Zero, bool &ForceV2Zero,
16237 unsigned &ShuffleImm, ArrayRef<int> Mask,
16238 const APInt &Zeroable) {
16239 int NumElts = VT.getVectorNumElements();
16240 assert(VT.getScalarSizeInBits() == 64 &&
16241 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16242 "Unexpected data type for VSHUFPD");
16243 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16244 "Illegal shuffle mask");
16245
16246 bool ZeroLane[2] = { true, true };
16247 for (int i = 0; i < NumElts; ++i)
16248 ZeroLane[i & 1] &= Zeroable[i];
16249
16250 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16251 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
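// e.g. for V4F64, the mask <1,5,2,7> matches with SHUFPDMask <1,1,0,1>
// (immediate 0b1011).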
16252 bool IsSHUFPD = true;
16253 bool IsCommutable = true;
16254 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16255 for (int i = 0; i < NumElts; ++i) {
16256 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16257 continue;
16258 if (Mask[i] < 0)
16259 return false;
16260 int Val = (i & 6) + NumElts * (i & 1);
16261 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16262 if (Mask[i] < Val || Mask[i] > Val + 1)
16263 IsSHUFPD = false;
16264 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16265 IsCommutable = false;
16266 SHUFPDMask[i] = Mask[i] % 2;
16267 }
16268
16269 if (!IsSHUFPD && !IsCommutable)
16270 return false;
16271
16272 if (!IsSHUFPD && IsCommutable)
16273 std::swap(V1, V2);
16274
16275 ForceV1Zero = ZeroLane[0];
16276 ForceV2Zero = ZeroLane[1];
16277 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16278 return true;
16279}
16280
16281 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16282 SDValue V2, ArrayRef<int> Mask,
16283 const APInt &Zeroable,
16284 const X86Subtarget &Subtarget,
16285 SelectionDAG &DAG) {
16286 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16287 "Unexpected data type for VSHUFPD");
16288
16289 unsigned Immediate = 0;
16290 bool ForceV1Zero = false, ForceV2Zero = false;
16291 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16292 Mask, Zeroable))
16293 return SDValue();
16294
16295 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16296 if (ForceV1Zero)
16297 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16298 if (ForceV2Zero)
16299 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16300
16301 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16302 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16303}
16304
16305 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16306 // by zeroable elements in the remaining 24 elements. Turn this into two
16307// vmovqb instructions shuffled together.
16308 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16309 SDValue V1, SDValue V2,
16310 ArrayRef<int> Mask,
16311 const APInt &Zeroable,
16312 SelectionDAG &DAG) {
16313 assert(VT == MVT::v32i8 && "Unexpected type!");
16314
16315 // The first 8 indices should be every 8th element.
16316 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16317 return SDValue();
16318
16319 // Remaining elements need to be zeroable.
16320 if (Zeroable.countl_one() < (Mask.size() - 8))
16321 return SDValue();
16322
16323 V1 = DAG.getBitcast(MVT::v4i64, V1);
16324 V2 = DAG.getBitcast(MVT::v4i64, V2);
16325
16326 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16327 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16328
16329 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16330 // the upper bits of the result using an unpckldq.
16331 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16332 { 0, 1, 2, 3, 16, 17, 18, 19,
16333 4, 5, 6, 7, 20, 21, 22, 23 });
16334 // Insert the unpckldq into a zero vector to widen to v32i8.
16335 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16336 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16337 DAG.getVectorIdxConstant(0, DL));
16338}
16339
16340// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16341// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16342// =>
16343// ul = unpckl v1, v2
16344// uh = unpckh v1, v2
16345// a = vperm ul, uh
16346// b = vperm ul, uh
16347//
16348// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16349// and permute. We cannot directly match v3 because it is split into two
16350// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16351// pair of 256-bit shuffles and makes sure the masks are consecutive.
16352//
16353// Once unpck and permute nodes are created, the permute corresponding to this
16354// shuffle is returned, while the other permute replaces the other half of the
16355// shuffle in the selection dag.
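// e.g. for v8i32, the matched pair of half shuffles has the masks
//   <0,8,1,9,2,10,3,11> and <4,12,5,13,6,14,7,15>.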
16356 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16357 SDValue V1, SDValue V2,
16358 ArrayRef<int> Mask,
16359 SelectionDAG &DAG) {
16360 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16361 VT != MVT::v32i8)
16362 return SDValue();
16363 // <B0, B1, B0+1, B1+1, ..., >
16364 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16365 unsigned Begin1) {
16366 size_t Size = Mask.size();
16367 assert(Size % 2 == 0 && "Expected even mask size");
16368 for (unsigned I = 0; I < Size; I += 2) {
16369 if (Mask[I] != (int)(Begin0 + I / 2) ||
16370 Mask[I + 1] != (int)(Begin1 + I / 2))
16371 return false;
16372 }
16373 return true;
16374 };
16375 // Check which half this shuffle node is
16376 int NumElts = VT.getVectorNumElements();
16377 size_t FirstQtr = NumElts / 2;
16378 size_t ThirdQtr = NumElts + NumElts / 2;
16379 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16380 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16381 if (!IsFirstHalf && !IsSecondHalf)
16382 return SDValue();
16383
16384 // Find the intersection between shuffle users of V1 and V2.
16385 SmallVector<SDNode *, 2> Shuffles;
16386 for (SDNode *User : V1->users())
16387 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16388 User->getOperand(1) == V2)
16389 Shuffles.push_back(User);
16390 // Limit user size to two for now.
16391 if (Shuffles.size() != 2)
16392 return SDValue();
16393 // Find out which half of the 512-bit shuffle each smaller shuffle is
16394 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16395 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16396 SDNode *FirstHalf;
16397 SDNode *SecondHalf;
16398 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16399 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16400 FirstHalf = Shuffles[0];
16401 SecondHalf = Shuffles[1];
16402 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16403 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16404 FirstHalf = Shuffles[1];
16405 SecondHalf = Shuffles[0];
16406 } else {
16407 return SDValue();
16408 }
16409 // Lower into unpck and perm. Return the perm of this shuffle and replace
16410 // the other.
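// Perm1 (imm 0x20) = <Unpckl lane 0, Unpckh lane 0>, the lower interleave.
// Perm2 (imm 0x31) = <Unpckl lane 1, Unpckh lane 1>, the upper interleave.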
16411 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16412 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16413 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16414 DAG.getTargetConstant(0x20, DL, MVT::i8));
16415 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16416 DAG.getTargetConstant(0x31, DL, MVT::i8));
16417 if (IsFirstHalf) {
16418 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16419 return Perm1;
16420 }
16421 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16422 return Perm2;
16423}
16424
16425/// Handle lowering of 4-lane 64-bit floating point shuffles.
16426///
16427/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16428/// isn't available.
16429 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16430 const APInt &Zeroable, SDValue V1, SDValue V2,
16431 const X86Subtarget &Subtarget,
16432 SelectionDAG &DAG) {
16433 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16434 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16435 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16436
16437 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16438 Subtarget, DAG))
16439 return V;
16440
16441 if (V2.isUndef()) {
16442 // Check for being able to broadcast a single element.
16443 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16444 Mask, Subtarget, DAG))
16445 return Broadcast;
16446
16447 // Use low duplicate instructions for masks that match their pattern.
16448 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16449 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16450
16451 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16452 // Non-half-crossing single input shuffles can be lowered with an
16453 // interleaved permutation.
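// e.g. the in-lane swap <1,0,3,2> gives VPERMILPMask = 0b0101.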
16454 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16455 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16457 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16458 }
16459
16460 // With AVX2 we have direct support for this permutation.
16461 if (Subtarget.hasAVX2())
16462 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16463 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16464
16465 // Try to create an in-lane repeating shuffle mask and then shuffle the
16466 // results into the target lanes.
16467 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16468 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16469 return V;
16470
16471 // Try to permute the lanes and then use a per-lane permute.
16472 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16473 Mask, DAG, Subtarget))
16474 return V;
16475
16476 // Otherwise, fall back.
16477 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16478 DAG, Subtarget);
16479 }
16480
16481 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16482 Zeroable, Subtarget, DAG))
16483 return Blend;
16484
16485 // Use dedicated unpack instructions for masks that match their pattern.
16486 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16487 return V;
16488
16489 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16490 Zeroable, Subtarget, DAG))
16491 return Op;
16492
16493 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16494 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16495 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16496 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16497
16498 // If we have lane crossing shuffles AND they don't all come from the lower
16499 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16500 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16501 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16502 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16503 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16504 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16505 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16506 (!Subtarget.hasAVX2() ||
16507 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16508 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16509
16510 // If we have one input in place, then we can permute the other input and
16511 // blend the result.
16512 if (V1IsInPlace || V2IsInPlace)
16513 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16514 Zeroable, Subtarget, DAG);
16515
16516 // Try to create an in-lane repeating shuffle mask and then shuffle the
16517 // results into the target lanes.
16518 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16519 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16520 return V;
16521
16522 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16523 // shuffle. However, if we have AVX2 and either input is already in place,
16524 // we will be able to shuffle the other input even across lanes in a single
16525 // instruction, so skip this pattern.
16526 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16527 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16528 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16529 return V;
16530
16531 // If we have VLX support, we can use VEXPAND.
16532 if (Subtarget.hasVLX())
16533 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16534 Zeroable, Subtarget, DAG))
16535 return V;
16536
16537 // If we have AVX2 then we always want to lower with a blend because at v4 we
16538 // can fully permute the elements.
16539 if (Subtarget.hasAVX2())
16540 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16541 Zeroable, Subtarget, DAG);
16542
16543 // Otherwise fall back on generic lowering.
16544 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16545 Subtarget, DAG);
16546}
16547
16548/// Handle lowering of 4-lane 64-bit integer shuffles.
16549///
16550/// This routine is only called when we have AVX2 and thus a reasonable
16551 /// instruction set for v4i64 shuffling.
16552 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16553 const APInt &Zeroable, SDValue V1, SDValue V2,
16554 const X86Subtarget &Subtarget,
16555 SelectionDAG &DAG) {
16556 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16557 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16558 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16559 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16560
16561 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16562 Subtarget, DAG))
16563 return V;
16564
16565 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG))
16567 return Blend;
16568
16569 // Check for being able to broadcast a single element.
16570 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16571 Subtarget, DAG))
16572 return Broadcast;
16573
16574 // Try to use shift instructions if fast.
16575 if (Subtarget.preferLowerShuffleAsShift())
16576 if (SDValue Shift =
16577 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16578 Subtarget, DAG, /*BitwiseOnly*/ true))
16579 return Shift;
16580
16581 if (V2.isUndef()) {
16582 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16583 // can use lower latency instructions that will operate on both lanes.
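// e.g. v4i64 <1,0,3,2> has the repeated mask <1,0>, which widens to the
// v8i32 PSHUFD mask <2,3,0,1>.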
16584 SmallVector<int, 2> RepeatedMask;
16585 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16586 SmallVector<int, 4> PSHUFDMask;
16587 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16588 return DAG.getBitcast(
16589 MVT::v4i64,
16590 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16591 DAG.getBitcast(MVT::v8i32, V1),
16592 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16593 }
16594
16595 // AVX2 provides a direct instruction for permuting a single input across
16596 // lanes.
16597 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16599 }
16600
16601 // Try to use shift instructions.
16602 if (SDValue Shift =
16603 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16604 DAG, /*BitwiseOnly*/ false))
16605 return Shift;
16606
16607 // If we have VLX support, we can use VALIGN or VEXPAND.
16608 if (Subtarget.hasVLX()) {
16609 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16610 Zeroable, Subtarget, DAG))
16611 return Rotate;
16612
16613 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16614 Zeroable, Subtarget, DAG))
16615 return V;
16616 }
16617
16618 // Try to use PALIGNR.
16619 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16620 Subtarget, DAG))
16621 return Rotate;
16622
16623 // Use dedicated unpack instructions for masks that match their pattern.
16624 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16625 return V;
16626
16627 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16628 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16629
16630 // If we have one input in place, then we can permute the other input and
16631 // blend the result.
16632 if (V1IsInPlace || V2IsInPlace)
16633 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16634 Zeroable, Subtarget, DAG);
16635
16636 // Try to create an in-lane repeating shuffle mask and then shuffle the
16637 // results into the target lanes.
16638 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16639 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16640 return V;
16641
16642 // Try to lower to PERMQ(BLENDD(V1,V2)).
16643 if (SDValue V =
16644 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16645 return V;
16646
16647 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16648 // shuffle. However, if we have AVX2 and either input is already in place,
16649 // we will be able to shuffle the other input even across lanes in a single
16650 // instruction, so skip this pattern.
16651 if (!V1IsInPlace && !V2IsInPlace)
16652 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16653 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16654 return Result;
16655
16656 // Otherwise fall back on generic blend lowering.
16657 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16658 Zeroable, Subtarget, DAG);
16659}
16660
16661/// Handle lowering of 8-lane 32-bit floating point shuffles.
16662///
16663/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16664/// isn't available.
16665 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16666 const APInt &Zeroable, SDValue V1, SDValue V2,
16667 const X86Subtarget &Subtarget,
16668 SelectionDAG &DAG) {
16669 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16670 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16671 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16672
16673 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16674 Zeroable, Subtarget, DAG))
16675 return Blend;
16676
16677 // Check for being able to broadcast a single element.
16678 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16679 Subtarget, DAG))
16680 return Broadcast;
16681
16682 if (!Subtarget.hasAVX2()) {
16683 SmallVector<int> InLaneMask;
16684 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16685
16686 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16687 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16688 /*SimpleOnly*/ true))
16689 return R;
16690 }
16691 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16692 Zeroable, Subtarget, DAG))
16693 return DAG.getBitcast(MVT::v8f32, ZExt);
16694
16695 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16696 // options to efficiently lower the shuffle.
16697 SmallVector<int, 4> RepeatedMask;
16698 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16699 assert(RepeatedMask.size() == 4 &&
16700 "Repeated masks must be half the mask width!");
16701
16702 // Use even/odd duplicate instructions for masks that match their pattern.
16703 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16704 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16705 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16706 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16707
16708 if (V2.isUndef())
16709 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16710 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16711
16712 // Use dedicated unpack instructions for masks that match their pattern.
16713 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16714 return V;
16715
16716 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16717 // have already handled any direct blends.
16718 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16719 }
16720
16721 // Try to create an in-lane repeating shuffle mask and then shuffle the
16722 // results into the target lanes.
16723 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16724 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16725 return V;
16726
16727 // If we have a single input shuffle with different shuffle patterns in the
16728 // two 128-bit lanes, use the variable-mask form of VPERMILPS.
16729 if (V2.isUndef()) {
16730 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16731 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16732 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16733 }
16734 if (Subtarget.hasAVX2()) {
16735 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16736 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16737 }
16738 // Otherwise, fall back.
16739 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16740 DAG, Subtarget);
16741 }
16742
16743 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16744 // shuffle.
16745 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16746 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16747 return Result;
16748
16749 // If we have VLX support, we can use VEXPAND.
16750 if (Subtarget.hasVLX())
16751 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16752 Zeroable, Subtarget, DAG))
16753 return V;
16754
16755 // Try to match an interleave of two v8f32s and lower them as unpck and
16756 // permutes using ymms. This needs to go before we try to split the vectors.
16757 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16758 if ((Subtarget.hasAVX2() ||
16761 !Subtarget.hasAVX512())
16762 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16763 Mask, DAG))
16764 return V;
16765
16766 // For non-AVX512 targets, if the mask is of 16-bit elements within each lane,
16767 // try to split, since after the split we get more efficient code using
16768 // vpunpcklwd and vpunpckhwd than with vblend.
16769 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16770 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16771 Subtarget, DAG);
16772
16773 // If we have AVX2 then we always want to lower with a blend because at v8 we
16774 // can fully permute the elements.
16775 if (Subtarget.hasAVX2())
16776 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16777 Zeroable, Subtarget, DAG);
16778
16779 // Otherwise fall back on generic lowering.
16780 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16781 Subtarget, DAG);
16782}
16783
16784/// Handle lowering of 8-lane 32-bit integer shuffles.
16785///
16786/// This routine is only called when we have AVX2 and thus a reasonable
16787 /// instruction set for v8i32 shuffling.
16788 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16789 const APInt &Zeroable, SDValue V1, SDValue V2,
16790 const X86Subtarget &Subtarget,
16791 SelectionDAG &DAG) {
16792 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16793 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16794 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16795 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16796
16797 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16798
16799 // Whenever we can lower this as a zext, that instruction is strictly faster
16800 // than any alternative. It also allows us to fold memory operands into the
16801 // shuffle in many cases.
16802 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16803 Zeroable, Subtarget, DAG))
16804 return ZExt;
16805
16806 // Try to match an interleave of two v8i32s and lower them as unpck and
16807 // permutes using ymms. This needs to go before we try to split the vectors.
16808 if (!Subtarget.hasAVX512())
16809 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16810 Mask, DAG))
16811 return V;
16812
16813 // For non-AVX512 targets, if the mask is of 16-bit elements within each lane,
16814 // try to split, since after the split we get more efficient code than vblend
16815 // by using vpunpcklwd and vpunpckhwd.
16816 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16817 !Subtarget.hasAVX512())
16818 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16819 Subtarget, DAG);
16820
16821 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16822 Zeroable, Subtarget, DAG))
16823 return Blend;
16824
16825 // Check for being able to broadcast a single element.
16826 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16827 Subtarget, DAG))
16828 return Broadcast;
16829
16830 // Try to use shift instructions if fast.
16831 if (Subtarget.preferLowerShuffleAsShift()) {
16832 if (SDValue Shift =
16833 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16834 Subtarget, DAG, /*BitwiseOnly*/ true))
16835 return Shift;
16836 if (NumV2Elements == 0)
16837 if (SDValue Rotate =
16838 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16839 return Rotate;
16840 }
16841
16842 // If the shuffle mask is repeated in each 128-bit lane we can use more
16843 // efficient instructions that mirror the shuffles across the two 128-bit
16844 // lanes.
16845 SmallVector<int, 4> RepeatedMask;
16846 bool Is128BitLaneRepeatedShuffle =
16847 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16848 if (Is128BitLaneRepeatedShuffle) {
16849 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16850 if (V2.isUndef())
16851 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16852 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16853
16854 // Use dedicated unpack instructions for masks that match their pattern.
16855 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16856 return V;
16857 }
16858
16859 // Try to use shift instructions.
16860 if (SDValue Shift =
16861 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16862 DAG, /*BitwiseOnly*/ false))
16863 return Shift;
16864
16865 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16866 if (SDValue Rotate =
16867 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16868 return Rotate;
16869
16870 // If we have VLX support, we can use VALIGN or EXPAND.
16871 if (Subtarget.hasVLX()) {
16872 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16873 Zeroable, Subtarget, DAG))
16874 return Rotate;
16875
16876 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16877 Zeroable, Subtarget, DAG))
16878 return V;
16879 }
16880
16881 // Try to use byte rotation instructions.
16882 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16883 Subtarget, DAG))
16884 return Rotate;
16885
16886 // Try to create an in-lane repeating shuffle mask and then shuffle the
16887 // results into the target lanes.
16888 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16889 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16890 return V;
16891
16892 if (V2.isUndef()) {
16893 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16894 // because that should be faster than the variable permute alternatives.
16895 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16896 return V;
16897
16898 // If the shuffle patterns aren't repeated but it's a single input, directly
16899 // generate a cross-lane VPERMD instruction.
16900 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16901 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16902 }
16903
16904 // Assume that a single SHUFPS is faster than an alternative sequence of
16905 // multiple instructions (even if the CPU has a domain penalty).
16906 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16907 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16908 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16909 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16910 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16911 CastV1, CastV2, DAG);
16912 return DAG.getBitcast(MVT::v8i32, ShufPS);
16913 }
16914
16915 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16916 // shuffle.
16917 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16918 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16919 return Result;
16920
16921 // Otherwise fall back on generic blend lowering.
16922 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16923 Zeroable, Subtarget, DAG);
16924}
16925
16926/// Handle lowering of 16-lane 16-bit integer shuffles.
16927///
16928/// This routine is only called when we have AVX2 and thus a reasonable
16929 /// instruction set for v16i16 shuffling.
16930 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16931 const APInt &Zeroable, SDValue V1, SDValue V2,
16932 const X86Subtarget &Subtarget,
16933 SelectionDAG &DAG) {
16934 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16935 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16936 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16937 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16938
16939 // Whenever we can lower this as a zext, that instruction is strictly faster
16940 // than any alternative. It also allows us to fold memory operands into the
16941 // shuffle in many cases.
16942 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16943 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16944 return ZExt;
16945
16946 // Check for being able to broadcast a single element.
16947 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16948 Subtarget, DAG))
16949 return Broadcast;
16950
16951 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16952 Zeroable, Subtarget, DAG))
16953 return Blend;
16954
16955 // Use dedicated unpack instructions for masks that match their pattern.
16956 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16957 return V;
16958
16959 // Use dedicated pack instructions for masks that match their pattern.
16960 if (SDValue V =
16961 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16962 return V;
16963
16964 // Try to lower using a truncation.
16965 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16966 Subtarget, DAG))
16967 return V;
16968
16969 // Try to use shift instructions.
16970 if (SDValue Shift =
16971 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16972 Subtarget, DAG, /*BitwiseOnly*/ false))
16973 return Shift;
16974
16975 // Try to use byte rotation instructions.
16976 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16977 Subtarget, DAG))
16978 return Rotate;
16979
16980 // Try to create an in-lane repeating shuffle mask and then shuffle the
16981 // results into the target lanes.
16982 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16983 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16984 return V;
16985
16986 if (V2.isUndef()) {
16987 // Try to use bit rotation instructions.
16988 if (SDValue Rotate =
16989 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16990 return Rotate;
16991
16992 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16993 // because that should be faster than the variable permute alternatives.
16994 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16995 return V;
16996
16997 // There are no generalized cross-lane shuffle operations available on i16
16998 // element types.
16999 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17000 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17001 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17002 return V;
17003
17004 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17005 DAG, Subtarget);
17006 }
17007
17008 SmallVector<int, 8> RepeatedMask;
17009 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17010 // As this is a single-input shuffle, the repeated mask should be
17011 // a strictly valid v8i16 mask that we can pass through to the v8i16
17012 // lowering to handle even the v16 case.
17013 return lowerV8I16GeneralSingleInputShuffle(
17014 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17015 }
17016 }
17017
17018 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17019 Zeroable, Subtarget, DAG))
17020 return PSHUFB;
17021
17022 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17023 if (Subtarget.hasBWI())
17024 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17025
17026 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17027 // shuffle.
17028 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17029 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17030 return Result;
17031
17032 // Try to permute the lanes and then use a per-lane permute.
17033 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17034 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17035 return V;
17036
17037 // Try to match an interleave of two v16i16s and lower them as unpck and
17038 // permutes using ymms.
17039 if (!Subtarget.hasAVX512())
17040 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17041 Mask, DAG))
17042 return V;
17043
17044 // Otherwise fall back on generic lowering.
17045 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17046 Subtarget, DAG);
17047}
17048
17049/// Handle lowering of 32-lane 8-bit integer shuffles.
17050///
17051/// This routine is only called when we have AVX2 and thus a reasonable
17052 /// instruction set for v32i8 shuffling.
17053 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17054 const APInt &Zeroable, SDValue V1, SDValue V2,
17055 const X86Subtarget &Subtarget,
17056 SelectionDAG &DAG) {
17057 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17058 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17059 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17060 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17061
17062 // Whenever we can lower this as a zext, that instruction is strictly faster
17063 // than any alternative. It also allows us to fold memory operands into the
17064 // shuffle in many cases.
17065 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17066 Zeroable, Subtarget, DAG))
17067 return ZExt;
17068
17069 // Check for being able to broadcast a single element.
17070 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17071 Subtarget, DAG))
17072 return Broadcast;
17073
17074 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17075 Zeroable, Subtarget, DAG))
17076 return Blend;
17077
17078 // Use dedicated unpack instructions for masks that match their pattern.
17079 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17080 return V;
17081
17082 // Use dedicated pack instructions for masks that match their pattern.
17083 if (SDValue V =
17084 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17085 return V;
17086
17087 // Try to lower using a truncation.
17088 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17089 Subtarget, DAG))
17090 return V;
17091
17092 // Try to use shift instructions.
17093 if (SDValue Shift =
17094 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17095 DAG, /*BitwiseOnly*/ false))
17096 return Shift;
17097
17098 // Try to use byte rotation instructions.
17099 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17100 Subtarget, DAG))
17101 return Rotate;
17102
17103 // Try to use bit rotation instructions.
17104 if (V2.isUndef())
17105 if (SDValue Rotate =
17106 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17107 return Rotate;
17108
17109 // Try to create an in-lane repeating shuffle mask and then shuffle the
17110 // results into the target lanes.
17111 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17112 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17113 return V;
17114
17115 // There are no generalized cross-lane shuffle operations available on i8
17116 // element types.
17117 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17118 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17119 // because that should be faster than the variable permute alternatives.
17120 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17121 return V;
17122
17123 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17124 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17125 return V;
17126
17127 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17128 DAG, Subtarget);
17129 }
17130
17131 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17132 Zeroable, Subtarget, DAG))
17133 return PSHUFB;
17134
17135 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17136 if (Subtarget.hasVBMI())
17137 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17138
17139 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17140 // shuffle.
17141 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17142 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17143 return Result;
17144
17145 // Try to permute the lanes and then use a per-lane permute.
17146 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17147 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17148 return V;
17149
17150 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17151 // by zeroable elements in the remaining 24 elements. Turn this into two
17152 // vmovqb instructions shuffled together.
17153 if (Subtarget.hasVLX())
17154 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17155 Mask, Zeroable, DAG))
17156 return V;
17157
17158 // Try to match an interleave of two v32i8s and lower them as unpck and
17159 // permutes using ymms.
17160 if (!Subtarget.hasAVX512())
17161 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17162 Mask, DAG))
17163 return V;
17164
17165 // Otherwise fall back on generic lowering.
17166 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17167 Subtarget, DAG);
17168}
17169
17170/// High-level routine to lower various 256-bit x86 vector shuffles.
17171///
17172/// This routine either breaks down the specific type of a 256-bit x86 vector
17173/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17174/// together based on the available instructions.
17175 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17176 SDValue V1, SDValue V2, const APInt &Zeroable,
17177 const X86Subtarget &Subtarget,
17178 SelectionDAG &DAG) {
17179 // If we have a single input to the zero element, insert that into V1 if we
17180 // can do so cheaply.
17181 int NumElts = VT.getVectorNumElements();
17182 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17183
17184 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17185 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17186 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17187 return Insertion;
17188
17189 // Handle special cases where the lower or upper half is UNDEF.
17190 if (SDValue V =
17191 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17192 return V;
17193
17194 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17195 // can check for those subtargets here and avoid much of the subtarget
17196 // querying in the per-vector-type lowering routines. With AVX1 we have
17197 // essentially *zero* ability to manipulate a 256-bit vector with integer
17198 // types. Since we'll use floating point types there eventually, just
17199 // immediately cast everything to a float and operate entirely in that domain.
17200 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17201 int ElementBits = VT.getScalarSizeInBits();
17202 if (ElementBits < 32) {
17203 // No floating point type available, if we can't use the bit operations
17204 // for masking/blending then decompose into 128-bit vectors.
17205 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17206 Subtarget, DAG))
17207 return V;
17208 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17209 return V;
17210 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17211 }
17212
17213 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17214 VT.getVectorNumElements());
17215 V1 = DAG.getBitcast(FpVT, V1);
17216 V2 = DAG.getBitcast(FpVT, V2);
17217 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17218 }
17219
17220 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17221 V1 = DAG.getBitcast(MVT::v16i16, V1);
17222 V2 = DAG.getBitcast(MVT::v16i16, V2);
17223 return DAG.getBitcast(VT,
17224 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17225 }
17226
17227 switch (VT.SimpleTy) {
17228 case MVT::v4f64:
17229 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17230 case MVT::v4i64:
17231 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17232 case MVT::v8f32:
17233 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17234 case MVT::v8i32:
17235 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17236 case MVT::v16i16:
17237 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17238 case MVT::v32i8:
17239 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17240
17241 default:
17242 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17243 }
17244}
17245
17246 /// Try to lower a vector shuffle as 128-bit shuffles.
17247 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17248 const APInt &Zeroable, SDValue V1, SDValue V2,
17249 const X86Subtarget &Subtarget,
17250 SelectionDAG &DAG) {
17251 assert(VT.getScalarSizeInBits() == 64 &&
17252 "Unexpected element type size for 128bit shuffle.");
17253
17254 // Handling a 256-bit vector requires VLX, and the function
17255 // lowerV2X128VectorShuffle() is most probably the better solution there.
17256 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17257
17258 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17259 SmallVector<int, 4> Widened128Mask;
17260 if (!canWidenShuffleElements(Mask, Widened128Mask))
17261 return SDValue();
17262 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17263
17264 // Try to use an insert into a zero vector.
17265 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17266 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17267 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17268 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17269 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17270 DAG.getVectorIdxConstant(0, DL));
17271 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17272 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17273 DAG.getVectorIdxConstant(0, DL));
17274 }
17275
17276 // Check for patterns which can be matched with a single insert of a 256-bit
17277 // subvector.
17278 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17279 if (OnlyUsesV1 ||
17280 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17281 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17282 SDValue SubVec =
17283 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17284 DAG.getVectorIdxConstant(0, DL));
17285 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17286 DAG.getVectorIdxConstant(4, DL));
17287 }
17288
17289 // See if this is an insertion of the lower 128-bits of V2 into V1.
17290 bool IsInsert = true;
17291 int V2Index = -1;
17292 for (int i = 0; i < 4; ++i) {
17293 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17294 if (Widened128Mask[i] < 0)
17295 continue;
17296
17297 // Make sure all V1 subvectors are in place.
17298 if (Widened128Mask[i] < 4) {
17299 if (Widened128Mask[i] != i) {
17300 IsInsert = false;
17301 break;
17302 }
17303 } else {
17304 // Make sure we only have a single V2 index and it's the lowest 128-bits.
17305 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17306 IsInsert = false;
17307 break;
17308 }
17309 V2Index = i;
17310 }
17311 }
17312 if (IsInsert && V2Index >= 0) {
17313 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17314 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17315 DAG.getVectorIdxConstant(0, DL));
17316 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17317 }
17318
17319 // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
17320 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17321 // possible we at least ensure the lanes stay sequential to help later
17322 // combines.
17323 SmallVector<int, 2> Widened256Mask;
17324 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17325 Widened128Mask.clear();
17326 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17327 }
17328
17329 // Try to lower to vshuf64x2/vshuf32x4.
17330 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17331 int PermMask[4] = {-1, -1, -1, -1};
17332 // Ensure elements came from the same Op.
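// For example, Widened128Mask <0,1,4,5> takes 128-bit lanes 0-1 from V1 into
// the low half and lanes 0-1 of V2 into the high half, so Ops becomes
// {V1, V2} and PermMask becomes <0,1,0,1> for the SHUF128 immediate.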
17333 for (int i = 0; i < 4; ++i) {
17334 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17335 if (Widened128Mask[i] < 0)
17336 continue;
17337
17338 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17339 unsigned OpIndex = i / 2;
17340 if (Ops[OpIndex].isUndef())
17341 Ops[OpIndex] = Op;
17342 else if (Ops[OpIndex] != Op)
17343 return SDValue();
17344
17345 PermMask[i] = Widened128Mask[i] % 4;
17346 }
17347
17348 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17349 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17350}
17351
17352/// Handle lowering of 8-lane 64-bit floating point shuffles.
17353 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17354 const APInt &Zeroable, SDValue V1, SDValue V2,
17355 const X86Subtarget &Subtarget,
17356 SelectionDAG &DAG) {
17357 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17358 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17359 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17360
17361 if (V2.isUndef()) {
17362 // Use low duplicate instructions for masks that match their pattern.
17363 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17364 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17365
17366 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17367 // Non-half-crossing single input shuffles can be lowered with an
17368 // interleaved permutation.
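// For VPERMILPD on v8f64 each immediate bit i selects the low or high double
// within the 128-bit lane holding result element i, so a bit is set exactly
// when Mask[i] names the odd element of its lane, as computed below.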
17369 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17370 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17371 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17372 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17373 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17374 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17375 }
17376
17377 SmallVector<int, 4> RepeatedMask;
17378 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17379 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17380 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17381 }
17382
17383 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17384 V2, Subtarget, DAG))
17385 return Shuf128;
17386
17387 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17388 return Unpck;
17389
17390 // Check if the blend happens to exactly fit that of SHUFPD.
17391 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17392 Zeroable, Subtarget, DAG))
17393 return Op;
17394
17395 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17396 Subtarget, DAG))
17397 return V;
17398
17399 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17400 Zeroable, Subtarget, DAG))
17401 return Blend;
17402
17403 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17404}
17405
17406/// Handle lowering of 16-lane 32-bit floating point shuffles.
17407 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17408 const APInt &Zeroable, SDValue V1, SDValue V2,
17409 const X86Subtarget &Subtarget,
17410 SelectionDAG &DAG) {
17411 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17412 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17413 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17414
17415 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17416 // options to efficiently lower the shuffle.
17417 SmallVector<int, 4> RepeatedMask;
17418 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17419 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17420
17421 // Use even/odd duplicate instructions for masks that match their pattern.
17422 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17423 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17424 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17425 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17426
17427 if (V2.isUndef())
17428 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17429 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17430
17431 // Use dedicated unpack instructions for masks that match their pattern.
17432 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17433 return V;
17434
17435 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17436 Zeroable, Subtarget, DAG))
17437 return Blend;
17438
17439 // Otherwise, fall back to a SHUFPS sequence.
17440 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17441 }
17442
17443 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17444 Zeroable, Subtarget, DAG))
17445 return Blend;
17446
17447 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17448 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17449 return DAG.getBitcast(MVT::v16f32, ZExt);
17450
17451 // Try to create an in-lane repeating shuffle mask and then shuffle the
17452 // results into the target lanes.
17453 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17454 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17455 return V;
17456
17457 // If we have a single input shuffle with different shuffle patterns in the
17458 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17459 if (V2.isUndef() &&
17460 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17461 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17462 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17463 }
17464
17465 // If we have AVX512F support, we can use VEXPAND.
17466 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17467 Zeroable, Subtarget, DAG))
17468 return V;
17469
17470 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17471}
17472
17473/// Handle lowering of 8-lane 64-bit integer shuffles.
17474 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17475 const APInt &Zeroable, SDValue V1, SDValue V2,
17476 const X86Subtarget &Subtarget,
17477 SelectionDAG &DAG) {
17478 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17479 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17480 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17481
17482 // Try to use shift instructions if fast.
17483 if (Subtarget.preferLowerShuffleAsShift())
17484 if (SDValue Shift =
17485 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17486 Subtarget, DAG, /*BitwiseOnly*/ true))
17487 return Shift;
17488
17489 if (V2.isUndef()) {
17490 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17491 // can use lower latency instructions that will operate on all four
17492 // 128-bit lanes.
17493 SmallVector<int, 2> Repeated128Mask;
17494 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17495 SmallVector<int, 4> PSHUFDMask;
17496 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17497 return DAG.getBitcast(
17498 MVT::v8i64,
17499 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17500 DAG.getBitcast(MVT::v16i32, V1),
17501 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17502 }
17503
17504 SmallVector<int, 4> Repeated256Mask;
17505 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17506 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17507 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17508 }
17509
17510 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17511 V2, Subtarget, DAG))
17512 return Shuf128;
17513
17514 // Try to use shift instructions.
17515 if (SDValue Shift =
17516 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17517 DAG, /*BitwiseOnly*/ false))
17518 return Shift;
17519
17520 // Try to use VALIGN.
17521 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17522 Zeroable, Subtarget, DAG))
17523 return Rotate;
17524
17525 // Try to use PALIGNR.
17526 if (Subtarget.hasBWI())
17527 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17528 Subtarget, DAG))
17529 return Rotate;
17530
17531 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17532 return Unpck;
17533
17534 // If we have AVX512F support, we can use VEXPAND.
17535 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17536 Subtarget, DAG))
17537 return V;
17538
17539 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17540 Zeroable, Subtarget, DAG))
17541 return Blend;
17542
17543 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17544}
17545
17546/// Handle lowering of 16-lane 32-bit integer shuffles.
17547 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17548 const APInt &Zeroable, SDValue V1, SDValue V2,
17549 const X86Subtarget &Subtarget,
17550 SelectionDAG &DAG) {
17551 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17552 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17553 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17554
17555 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17556
17557 // Whenever we can lower this as a zext, that instruction is strictly faster
17558 // than any alternative. It also allows us to fold memory operands into the
17559 // shuffle in many cases.
17560 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17561 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17562 return ZExt;
17563
17564 // Try to use shift instructions if fast.
17565 if (Subtarget.preferLowerShuffleAsShift()) {
17566 if (SDValue Shift =
17567 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17568 Subtarget, DAG, /*BitwiseOnly*/ true))
17569 return Shift;
17570 if (NumV2Elements == 0)
17571 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17572 Subtarget, DAG))
17573 return Rotate;
17574 }
17575
17576 // If the shuffle mask is repeated in each 128-bit lane we can use more
17577 // efficient instructions that mirror the shuffles across the four 128-bit
17578 // lanes.
17579 SmallVector<int, 4> RepeatedMask;
17580 bool Is128BitLaneRepeatedShuffle =
17581 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17582 if (Is128BitLaneRepeatedShuffle) {
17583 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17584 if (V2.isUndef())
17585 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17586 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17587
17588 // Use dedicated unpack instructions for masks that match their pattern.
17589 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17590 return V;
17591 }
17592
17593 // Try to use shift instructions.
17594 if (SDValue Shift =
17595 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17596 Subtarget, DAG, /*BitwiseOnly*/ false))
17597 return Shift;
17598
17599 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17600 if (SDValue Rotate =
17601 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17602 return Rotate;
17603
17604 // Try to use VALIGN.
17605 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17606 Zeroable, Subtarget, DAG))
17607 return Rotate;
17608
17609 // Try to use byte rotation instructions.
17610 if (Subtarget.hasBWI())
17611 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17612 Subtarget, DAG))
17613 return Rotate;
17614
17615 // Assume that a single SHUFPS is faster than using a permv shuffle.
17616 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17617 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17618 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17619 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17620 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17621 CastV1, CastV2, DAG);
17622 return DAG.getBitcast(MVT::v16i32, ShufPS);
17623 }
17624
17625 // Try to create an in-lane repeating shuffle mask and then shuffle the
17626 // results into the target lanes.
17627 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17628 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17629 return V;
17630
17631 // If we have AVX512F support, we can use VEXPAND.
17632 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17633 Zeroable, Subtarget, DAG))
17634 return V;
17635
17636 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17637 Zeroable, Subtarget, DAG))
17638 return Blend;
17639
17640 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17641}
17642
17643/// Handle lowering of 32-lane 16-bit integer shuffles.
17644 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17645 const APInt &Zeroable, SDValue V1, SDValue V2,
17646 const X86Subtarget &Subtarget,
17647 SelectionDAG &DAG) {
17648 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17649 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17650 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17651 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17652
17653 // Whenever we can lower this as a zext, that instruction is strictly faster
17654 // than any alternative. It also allows us to fold memory operands into the
17655 // shuffle in many cases.
17656 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17657 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17658 return ZExt;
17659
17660 // Use dedicated unpack instructions for masks that match their pattern.
17661 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17662 return V;
17663
17664 // Use dedicated pack instructions for masks that match their pattern.
17665 if (SDValue V =
17666 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17667 return V;
17668
17669 // Try to use shift instructions.
17670 if (SDValue Shift =
17671 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17672 Subtarget, DAG, /*BitwiseOnly*/ false))
17673 return Shift;
17674
17675 // Try to use byte rotation instructions.
17676 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17677 Subtarget, DAG))
17678 return Rotate;
17679
17680 if (V2.isUndef()) {
17681 // Try to use bit rotation instructions.
17682 if (SDValue Rotate =
17683 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17684 return Rotate;
17685
17686 SmallVector<int, 8> RepeatedMask;
17687 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17688 // As this is a single-input shuffle, the repeated mask should be
17689 // a strictly valid v8i16 mask that we can pass through to the v8i16
17690 // lowering to handle even the v32 case.
17691 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17692 RepeatedMask, Subtarget, DAG);
17693 }
17694 }
17695
17696 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17697 Zeroable, Subtarget, DAG))
17698 return Blend;
17699
17700 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17701 Zeroable, Subtarget, DAG))
17702 return PSHUFB;
17703
17704 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17705 // shuffle.
17706 if (!V2.isUndef())
17707 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17708 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17709 return Result;
17710
17711 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17712}
17713
17714/// Handle lowering of 64-lane 8-bit integer shuffles.
17715 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17716 const APInt &Zeroable, SDValue V1, SDValue V2,
17717 const X86Subtarget &Subtarget,
17718 SelectionDAG &DAG) {
17719 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17720 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17721 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17722 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17723
17724 // Whenever we can lower this as a zext, that instruction is strictly faster
17725 // than any alternative. It also allows us to fold memory operands into the
17726 // shuffle in many cases.
17727 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17728 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17729 return ZExt;
17730
17731 // Use dedicated unpack instructions for masks that match their pattern.
17732 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17733 return V;
17734
17735 // Use dedicated pack instructions for masks that match their pattern.
17736 if (SDValue V =
17737 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17738 return V;
17739
17740 // Try to use shift instructions.
17741 if (SDValue Shift =
17742 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17743 DAG, /*BitwiseOnly*/ false))
17744 return Shift;
17745
17746 // Try to use byte rotation instructions.
17747 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17748 Subtarget, DAG))
17749 return Rotate;
17750
17751 // Try to use bit rotation instructions.
17752 if (V2.isUndef())
17753 if (SDValue Rotate =
17754 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17755 return Rotate;
17756
17757 // Lower as AND if possible.
17758 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17759 Zeroable, Subtarget, DAG))
17760 return Masked;
17761
17762 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17763 Zeroable, Subtarget, DAG))
17764 return PSHUFB;
17765
17766 // Try to create an in-lane repeating shuffle mask and then shuffle the
17767 // results into the target lanes.
17768 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17769 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17770 return V;
17771
17772 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17773 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17774 return Result;
17775
17776 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17777 Zeroable, Subtarget, DAG))
17778 return Blend;
17779
17780 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17781 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17782 // PALIGNR will be cheaper than the second PSHUFB+OR.
17783 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17784 Mask, Subtarget, DAG))
17785 return V;
17786
17787 // If we can't directly blend but can use PSHUFB, that will be better as it
17788 // can both shuffle and set up the inefficient blend.
17789 bool V1InUse, V2InUse;
17790 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17791 DAG, V1InUse, V2InUse);
17792 }
17793
17794 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17795 // shuffle.
17796 if (!V2.isUndef())
17797 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17798 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17799 return Result;
17800
17801 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17802 if (Subtarget.hasVBMI())
17803 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17804
17805 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17806}
17807
17808/// High-level routine to lower various 512-bit x86 vector shuffles.
17809///
17810/// This routine either breaks down the specific type of a 512-bit x86 vector
17811/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17812/// together based on the available instructions.
17813 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17814 MVT VT, SDValue V1, SDValue V2,
17815 const APInt &Zeroable,
17816 const X86Subtarget &Subtarget,
17817 SelectionDAG &DAG) {
17818 assert(Subtarget.hasAVX512() &&
17819 "Cannot lower 512-bit vectors w/ basic ISA!");
17820
17821 // If we have a single input to the zero element, insert that into V1 if we
17822 // can do so cheaply.
17823 int NumElts = Mask.size();
17824 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17825
17826 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17827 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17828 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17829 return Insertion;
17830
17831 // Handle special cases where the lower or upper half is UNDEF.
17832 if (SDValue V =
17833 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17834 return V;
17835
17836 // Check for being able to broadcast a single element.
17837 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17838 Subtarget, DAG))
17839 return Broadcast;
17840
17841 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17842 // Try using bit ops for masking and blending before falling back to
17843 // splitting.
17844 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17845 Subtarget, DAG))
17846 return V;
17847 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17848 return V;
17849
17850 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17851 }
17852
17853 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17854 if (!Subtarget.hasBWI())
17855 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17856 /*SimpleOnly*/ false);
17857
17858 V1 = DAG.getBitcast(MVT::v32i16, V1);
17859 V2 = DAG.getBitcast(MVT::v32i16, V2);
17860 return DAG.getBitcast(VT,
17861 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17862 }
17863
17864 // Dispatch to each element type for lowering. If we don't have support for
17865 // specific element type shuffles at 512 bits, immediately split them and
17866 // lower them. Each lowering routine of a given type is allowed to assume that
17867 // the requisite ISA extensions for that element type are available.
17868 switch (VT.SimpleTy) {
17869 case MVT::v8f64:
17870 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17871 case MVT::v16f32:
17872 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17873 case MVT::v8i64:
17874 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17875 case MVT::v16i32:
17876 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17877 case MVT::v32i16:
17878 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17879 case MVT::v64i8:
17880 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17881
17882 default:
17883 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17884 }
17885}
17886
17887 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17888 MVT VT, SDValue V1, SDValue V2,
17889 const X86Subtarget &Subtarget,
17890 SelectionDAG &DAG) {
17891 // Shuffle should be unary.
17892 if (!V2.isUndef())
17893 return SDValue();
17894
17895 int ShiftAmt = -1;
17896 int NumElts = Mask.size();
17897 for (int i = 0; i != NumElts; ++i) {
17898 int M = Mask[i];
17899 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17900 "Unexpected mask index.");
17901 if (M < 0)
17902 continue;
17903
17904 // The first non-undef element determines our shift amount.
17905 if (ShiftAmt < 0) {
17906 ShiftAmt = M - i;
17907 // Need to be shifting right.
17908 if (ShiftAmt <= 0)
17909 return SDValue();
17910 }
17911 // All non-undef elements must shift by the same amount.
17912 if (ShiftAmt != M - i)
17913 return SDValue();
17914 }
17915 assert(ShiftAmt >= 0 && "All undef?");
17916
17917 // Great, we found a shift right.
17918 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17919 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17920 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17921 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17922 DAG.getVectorIdxConstant(0, DL));
17923}
17924
17925// Determine if this shuffle can be implemented with a KSHIFT instruction.
17926// Returns the shift amount if possible or -1 if not. This is a simplified
17927// version of matchShuffleAsShift.
17928static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17929 int MaskOffset, const APInt &Zeroable) {
17930 int Size = Mask.size();
17931
17932 auto CheckZeros = [&](int Shift, bool Left) {
17933 for (int j = 0; j < Shift; ++j)
17934 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17935 return false;
17936
17937 return true;
17938 };
17939
17940 auto MatchShift = [&](int Shift, bool Left) {
17941 unsigned Pos = Left ? Shift : 0;
17942 unsigned Low = Left ? 0 : Shift;
17943 unsigned Len = Size - Shift;
17944 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17945 };
17946
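// For example, with MaskOffset == 0 and Size == 8, the mask <2,3,4,5,6,7,Z,Z>
// (Z = zeroable) matches a right shift by 2: the low 6 elements are
// sequential starting at 2 and the top 2 lanes are zeroable.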
17947 for (int Shift = 1; Shift != Size; ++Shift)
17948 for (bool Left : {true, false})
17949 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17950 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17951 return Shift;
17952 }
17953
17954 return -1;
17955}
17956
17957
17958// Lower vXi1 vector shuffles.
17959 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17960 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17961 // vector, shuffle, and then truncate it back.
17962 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17963 MVT VT, SDValue V1, SDValue V2,
17964 const APInt &Zeroable,
17965 const X86Subtarget &Subtarget,
17966 SelectionDAG &DAG) {
17967 assert(Subtarget.hasAVX512() &&
17968 "Cannot lower 512-bit vectors w/o basic ISA!");
17969
17970 int NumElts = Mask.size();
17971 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17972
17973 // Try to recognize shuffles that are just padding a subvector with zeros.
17974 int SubvecElts = 0;
17975 int Src = -1;
17976 for (int i = 0; i != NumElts; ++i) {
17977 if (Mask[i] >= 0) {
17978 // Grab the source from the first valid mask. All subsequent elements need
17979 // to use this same source.
17980 if (Src < 0)
17981 Src = Mask[i] / NumElts;
17982 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17983 break;
17984 }
17985
17986 ++SubvecElts;
17987 }
17988 assert(SubvecElts != NumElts && "Identity shuffle?");
17989
17990 // Clip to a power of 2.
17991 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17992
17993 // Make sure the number of zeroable bits in the top at least covers the bits
17994 // not covered by the subvector.
17995 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17996 assert(Src >= 0 && "Expected a source!");
17997 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17998 SDValue Extract =
17999 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18000 DAG.getVectorIdxConstant(0, DL));
18001 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18002 DAG.getConstant(0, DL, VT), Extract,
18003 DAG.getVectorIdxConstant(0, DL));
18004 }
18005
18006 // Try a simple shift right with undef elements. Later we'll try with zeros.
18007 if (SDValue Shift =
18008 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18009 return Shift;
18010
18011 // Try to match KSHIFTs.
18012 unsigned Offset = 0;
18013 for (SDValue V : {V1, V2}) {
18014 unsigned Opcode;
18015 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18016 if (ShiftAmt >= 0) {
18017 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18018 MVT WideVT = Res.getSimpleValueType();
18019 // Widened right shifts need two shifts to ensure we shift in zeroes.
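// E.g. a v8i1 right shift by 2 that was widened to v16i1 becomes KSHIFTL by 8
// to park the bits in the MSBs, then KSHIFTR by 10, so zeroes rather than
// stale upper bits land in the vacated positions.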
18020 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18021 int WideElts = WideVT.getVectorNumElements();
18022 // Shift left to put the original vector in the MSBs of the new size.
18023 Res =
18024 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18025 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18026 // Increase the shift amount to account for the left shift.
18027 ShiftAmt += WideElts - NumElts;
18028 }
18029
18030 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18031 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18032 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18033 DAG.getVectorIdxConstant(0, DL));
18034 }
18035 Offset += NumElts; // Increment for next iteration.
18036 }
18037
18038 // If we're performing an unary shuffle on a SETCC result, try to shuffle the
18039 // ops instead.
18040 // TODO: What other unary shuffles would benefit from this?
18041 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18042 SDValue Op0 = V1.getOperand(0);
18043 SDValue Op1 = V1.getOperand(1);
18044 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18045 EVT OpVT = Op0.getValueType();
18046 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18047 return DAG.getSetCC(
18048 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18049 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18050 }
18051
18052 MVT ExtVT;
18053 switch (VT.SimpleTy) {
18054 default:
18055 llvm_unreachable("Expected a vector of i1 elements");
18056 case MVT::v2i1:
18057 ExtVT = MVT::v2i64;
18058 break;
18059 case MVT::v4i1:
18060 ExtVT = MVT::v4i32;
18061 break;
18062 case MVT::v8i1:
18063 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18064 // shuffle.
18065 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18066 break;
18067 case MVT::v16i1:
18068 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18069 // 256-bit operation available.
18070 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18071 break;
18072 case MVT::v32i1:
18073 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18074 // 256-bit operation available.
18075 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18076 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18077 break;
18078 case MVT::v64i1:
18079 // Fall back to scalarization. FIXME: We can do better if the shuffle
18080 // can be partitioned cleanly.
18081 if (!Subtarget.useBWIRegs())
18082 return SDValue();
18083 ExtVT = MVT::v64i8;
18084 break;
18085 }
18086
18087 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18088 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18089
18090 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18091 // i1 was sign extended, so we can use X86ISD::CVT2MASK.
18092 int NumElems = VT.getVectorNumElements();
18093 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18094 (Subtarget.hasDQI() && (NumElems < 32)))
18095 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18096 Shuffle, ISD::SETGT);
18097
18098 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18099}
18100
18101/// Helper function that returns true if the shuffle mask should be
18102/// commuted to improve canonicalization.
18103 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18104 int NumElements = Mask.size();
18105
18106 int NumV1Elements = 0, NumV2Elements = 0;
18107 for (int M : Mask)
18108 if (M < 0)
18109 continue;
18110 else if (M < NumElements)
18111 ++NumV1Elements;
18112 else
18113 ++NumV2Elements;
18114
18115 // Commute the shuffle as needed such that more elements come from V1 than
18116 // V2. This allows us to match the shuffle pattern strictly on how many
18117 // elements come from V1 without handling the symmetric cases.
18118 if (NumV2Elements > NumV1Elements)
18119 return true;
18120
18121 assert(NumV1Elements > 0 && "No V1 indices");
18122
18123 if (NumV2Elements == 0)
18124 return false;
18125
18126 // When the number of V1 and V2 elements are the same, try to minimize the
18127 // number of uses of V2 in the low half of the vector. When that is tied,
18128 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18129 // indices for V2. When those are equal, try to ensure that the number of odd
18130 // indices for V1 is lower than the number of odd indices for V2.
18131 if (NumV1Elements == NumV2Elements) {
18132 int LowV1Elements = 0, LowV2Elements = 0;
18133 for (int M : Mask.slice(0, NumElements / 2))
18134 if (M >= NumElements)
18135 ++LowV2Elements;
18136 else if (M >= 0)
18137 ++LowV1Elements;
18138 if (LowV2Elements > LowV1Elements)
18139 return true;
18140 if (LowV2Elements == LowV1Elements) {
18141 int SumV1Indices = 0, SumV2Indices = 0;
18142 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18143 if (Mask[i] >= NumElements)
18144 SumV2Indices += i;
18145 else if (Mask[i] >= 0)
18146 SumV1Indices += i;
18147 if (SumV2Indices < SumV1Indices)
18148 return true;
18149 if (SumV2Indices == SumV1Indices) {
18150 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18151 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18152 if (Mask[i] >= NumElements)
18153 NumV2OddIndices += i % 2;
18154 else if (Mask[i] >= 0)
18155 NumV1OddIndices += i % 2;
18156 if (NumV2OddIndices < NumV1OddIndices)
18157 return true;
18158 }
18159 }
18160 }
18161
18162 return false;
18163}
18164
18165 static bool canCombineAsMaskOperation(SDValue V,
18166 const X86Subtarget &Subtarget) {
18167 if (!Subtarget.hasAVX512())
18168 return false;
18169
18170 if (!V.getValueType().isSimple())
18171 return false;
18172
18173 MVT VT = V.getSimpleValueType().getScalarType();
18174 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18175 return false;
18176
18177 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18178 // are preferable to blendw/blendvb/masked-mov.
18179 if ((VT == MVT::i16 || VT == MVT::i8) &&
18180 V.getSimpleValueType().getSizeInBits() < 512)
18181 return false;
18182
18183 auto HasMaskOperation = [&](SDValue V) {
18184 // TODO: Currently we only check a limited set of opcodes. We could probably
18185 // extend it to all binary operations by checking TLI.isBinOp().
18186 switch (V->getOpcode()) {
18187 default:
18188 return false;
18189 case ISD::ADD:
18190 case ISD::SUB:
18191 case ISD::AND:
18192 case ISD::XOR:
18193 case ISD::OR:
18194 case ISD::SMAX:
18195 case ISD::SMIN:
18196 case ISD::UMAX:
18197 case ISD::UMIN:
18198 case ISD::ABS:
18199 case ISD::SHL:
18200 case ISD::SRL:
18201 case ISD::SRA:
18202 case ISD::MUL:
18203 break;
18204 }
18205 if (!V->hasOneUse())
18206 return false;
18207
18208 return true;
18209 };
18210
18211 if (HasMaskOperation(V))
18212 return true;
18213
18214 return false;
18215}
18216
18217// Forward declaration.
18218 static SDValue canonicalizeShuffleMaskWithHorizOp(
18219 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18220 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18221 const X86Subtarget &Subtarget);
18222
18223 /// Top-level lowering for x86 vector shuffles.
18224///
18225/// This handles decomposition, canonicalization, and lowering of all x86
18226/// vector shuffles. Most of the specific lowering strategies are encapsulated
18227/// above in helper routines. The canonicalization attempts to widen shuffles
18228/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18229/// s.t. only one of the two inputs needs to be tested, etc.
18230 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18231 SelectionDAG &DAG) {
18232 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18233 ArrayRef<int> OrigMask = SVOp->getMask();
18234 SDValue V1 = Op.getOperand(0);
18235 SDValue V2 = Op.getOperand(1);
18236 MVT VT = Op.getSimpleValueType();
18237 int NumElements = VT.getVectorNumElements();
18238 SDLoc DL(Op);
18239 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18240
18241 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18242 "Can't lower MMX shuffles");
18243
18244 bool V1IsUndef = V1.isUndef();
18245 bool V2IsUndef = V2.isUndef();
18246 if (V1IsUndef && V2IsUndef)
18247 return DAG.getUNDEF(VT);
18248
18249 // When we create a shuffle node we put the UNDEF node as the second operand,
18250 // but in some cases the first operand may be transformed to UNDEF.
18251 // In this case we should just commute the node.
18252 if (V1IsUndef)
18253 return DAG.getCommutedVectorShuffle(*SVOp);
18254
18255 // Check for non-undef masks pointing at an undef vector and make the masks
18256 // undef as well. This makes it easier to match the shuffle based solely on
18257 // the mask.
18258 if (V2IsUndef &&
18259 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18260 SmallVector<int, 8> NewMask(OrigMask);
18261 for (int &M : NewMask)
18262 if (M >= NumElements)
18263 M = -1;
18264 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18265 }
18266
18267 // Check for illegal shuffle mask element index values.
18268 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18269 (void)MaskUpperLimit;
18270 assert(llvm::all_of(OrigMask,
18271 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18272 "Out of bounds shuffle index");
18273
18274 // We actually see shuffles that are entirely re-arrangements of a set of
18275 // zero inputs. This mostly happens while decomposing complex shuffles into
18276 // simple ones. Directly lower these as a buildvector of zeros.
18277 APInt KnownUndef, KnownZero;
18278 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18279
18280 APInt Zeroable = KnownUndef | KnownZero;
18281 if (Zeroable.isAllOnes())
18282 return getZeroVector(VT, Subtarget, DAG, DL);
18283
18284 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18285
18286 // Try to collapse shuffles into using a vector type with fewer elements but
18287 // wider element types. We cap this to not form integers or floating point
18288 // elements wider than 64 bits. It does not seem beneficial to form i128
18289 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
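// For example, a v8i32 shuffle with mask <0,1,4,5,2,3,6,7> can be widened to
// a v4i64 shuffle with mask <0,2,1,3>, halving the element count.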
18290 SmallVector<int, 16> WidenedMask;
18291 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18292 !canCombineAsMaskOperation(V1, Subtarget) &&
18293 !canCombineAsMaskOperation(V2, Subtarget) &&
18294 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18295 // Shuffle mask widening should not interfere with a broadcast opportunity
18296 // by obfuscating the operands with bitcasts.
18297 // TODO: Avoid lowering directly from this top-level function: make this
18298 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18299 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18300 Subtarget, DAG))
18301 return Broadcast;
18302
18303 MVT NewEltVT = VT.isFloatingPoint()
18304 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18305 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18306 int NewNumElts = NumElements / 2;
18307 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18308 // Make sure that the new vector type is legal. For example, v2f64 isn't
18309 // legal on SSE1.
18310 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18311 if (V2IsZero) {
18312 // Modify the new Mask to take all zeros from the all-zero vector.
18313 // Choose indices that are blend-friendly.
18314 bool UsedZeroVector = false;
18315 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18316 "V2's non-undef elements are used?!");
18317 for (int i = 0; i != NewNumElts; ++i)
18318 if (WidenedMask[i] == SM_SentinelZero) {
18319 WidenedMask[i] = i + NewNumElts;
18320 UsedZeroVector = true;
18321 }
18322 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18323 // some elements to be undef.
18324 if (UsedZeroVector)
18325 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18326 }
18327 V1 = DAG.getBitcast(NewVT, V1);
18328 V2 = DAG.getBitcast(NewVT, V2);
18329 return DAG.getBitcast(
18330 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18331 }
18332 }
18333
18334 SmallVector<SDValue> Ops = {V1, V2};
18335 SmallVector<int> Mask(OrigMask);
18336
18337 // Canonicalize the shuffle with any horizontal ops inputs.
18338 // NOTE: This may update Ops and Mask.
18339 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18340 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18341 return DAG.getBitcast(VT, HOp);
18342
18343 V1 = DAG.getBitcast(VT, Ops[0]);
18344 V2 = DAG.getBitcast(VT, Ops[1]);
18345 assert(NumElements == (int)Mask.size() &&
18346 "canonicalizeShuffleMaskWithHorizOp "
18347 "shouldn't alter the shuffle mask size");
18348
18349 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18350 // These will be materialized uniformly anyway, so make splat matching easier.
18351 // TODO: Allow all int constants?
18352 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18353 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18354 BitVector Undefs;
18355 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18356 if (Undefs.any() &&
18357 (isNullConstant(Splat) || isAllOnesConstant(Splat) ||
18358 isa<ConstantFPSDNode>(Splat))) {
18359 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18360 }
18361 }
18362 }
18363 return V;
18364 };
18365 V1 = CanonicalizeConstant(V1);
18366 V2 = CanonicalizeConstant(V2);
18367
18368 // Commute the shuffle if it will improve canonicalization.
18369 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18370 ShuffleVectorSDNode::commuteMask(Mask);
18371 std::swap(V1, V2);
18372 }
18373
18374 // For each vector width, delegate to a specialized lowering routine.
18375 if (VT.is128BitVector())
18376 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18377
18378 if (VT.is256BitVector())
18379 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18380
18381 if (VT.is512BitVector())
18382 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18383
18384 if (Is1BitVector)
18385 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18386
18387 llvm_unreachable("Unimplemented!");
18388}
18389
18390// As legal vpcompress instructions depend on various AVX512 extensions, try to
18391// convert illegal vector sizes to legal ones to avoid expansion.
18392 static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18393 SelectionDAG &DAG) {
18394 assert(Subtarget.hasAVX512() &&
18395 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18396
18397 SDLoc DL(Op);
18398 SDValue Vec = Op.getOperand(0);
18399 SDValue Mask = Op.getOperand(1);
18400 SDValue Passthru = Op.getOperand(2);
18401
18402 EVT VecVT = Vec.getValueType();
18403 EVT ElementVT = VecVT.getVectorElementType();
18404 unsigned NumElements = VecVT.getVectorNumElements();
18405 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18406 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18407
18408 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18409 // compressed as 512-bit vectors in AVX512F.
18410 if (NumVecBits != 128 && NumVecBits != 256)
18411 return SDValue();
18412
18413 if (NumElementBits == 32 || NumElementBits == 64) {
18414 unsigned NumLargeElements = 512 / NumElementBits;
18415 MVT LargeVecVT =
18416 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18417 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18418
18419 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18420 DAG, DL);
18421 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18422 Subtarget, DAG, DL);
18423 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18424 : widenSubVector(LargeVecVT, Passthru,
18425 /*ZeroNewElements=*/false,
18426 Subtarget, DAG, DL);
18427
18428 SDValue Compressed =
18429 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18430 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18431 DAG.getConstant(0, DL, MVT::i64));
18432 }
18433
18434 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18435 VecVT == MVT::v16i16) {
18436 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18437 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
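// Editorial note: e.g. v8i16 widens to v8i64 and v16i8/v16i16 widen to v16i32
// here, so a single legal 512-bit VECTOR_COMPRESS can be used before
// truncating back to the original narrow type.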
18438
18439 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18440 Passthru = Passthru.isUndef()
18441 ? DAG.getUNDEF(LargeVecVT)
18442 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18443
18444 SDValue Compressed =
18445 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18446 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18447 }
18448
18449 return SDValue();
18450}
18451
18452/// Try to lower a VSELECT instruction to a vector shuffle.
18453 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18454 const X86Subtarget &Subtarget,
18455 SelectionDAG &DAG) {
18456 SDValue Cond = Op.getOperand(0);
18457 SDValue LHS = Op.getOperand(1);
18458 SDValue RHS = Op.getOperand(2);
18459 MVT VT = Op.getSimpleValueType();
18460
18461 // Only non-legal VSELECTs reach this lowering, convert those into generic
18462 // shuffles and re-use the shuffle lowering path for blends.
18463 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18464 SmallVector<int, 32> Mask;
18465 if (createShuffleMaskFromVSELECT(Mask, Cond))
18466 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18467 }
18468
18469 return SDValue();
18470}
18471
18472SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18473 SDValue Cond = Op.getOperand(0);
18474 SDValue LHS = Op.getOperand(1);
18475 SDValue RHS = Op.getOperand(2);
18476
18477 SDLoc dl(Op);
18478 MVT VT = Op.getSimpleValueType();
18479 if (isSoftF16(VT, Subtarget)) {
18480 MVT NVT = VT.changeVectorElementTypeToInteger();
18481 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18482 DAG.getBitcast(NVT, LHS),
18483 DAG.getBitcast(NVT, RHS)));
18484 }
18485
18486 // A vselect where all conditions and data are constants can be optimized into
18487 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18488 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18489 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18490 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18491 return SDValue();
18492
18493 // Try to lower this to a blend-style vector shuffle. This can handle all
18494 // constant condition cases.
18495 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18496 return BlendOp;
18497
18498 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18499 // with patterns on the mask registers on AVX-512.
18500 MVT CondVT = Cond.getSimpleValueType();
18501 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18502 if (CondEltSize == 1)
18503 return Op;
18504
18505 // Variable blends are only legal from SSE4.1 onward.
18506 if (!Subtarget.hasSSE41())
18507 return SDValue();
18508
18509 unsigned EltSize = VT.getScalarSizeInBits();
18510 unsigned NumElts = VT.getVectorNumElements();
18511
18512 // Expand v32i16/v64i8 without BWI.
18513 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18514 return SDValue();
18515
18516 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18517 // into an i1 condition so that we can use the mask-based 512-bit blend
18518 // instructions.
18519 if (VT.getSizeInBits() == 512) {
18520 // Build a mask by testing the condition against zero.
18521 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18522 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18523 DAG.getConstant(0, dl, CondVT),
18524 ISD::SETNE);
18525 // Now return a new VSELECT using the mask.
18526 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18527 }
18528
18529 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18530 if (CondEltSize != EltSize) {
18531 // If we don't have a sign splat, rely on the expansion.
18532 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18533 return SDValue();
18534
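// Editorial note: e.g. a v4i32 all-sign-bits condition driving a v4i64 select is
// sign-extended to v4i64 here so the blend sees a matching element width.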
18535 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18536 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18537 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18538 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18539 }
18540
18541 // For v16i16/v32i8 selects without AVX2, if the condition and another operand
18542 // are free to split, then it is better to split before expanding the
18543 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18544 // TODO: This is very similar to narrowVectorSelect.
18545 // TODO: Add Load splitting to isFreeToSplitVector ?
18546 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18547 !Subtarget.hasXOP()) {
18548 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18549 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18550 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18551 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18552 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18553 if (FreeCond && (FreeLHS || FreeRHS))
18554 return splitVectorOp(Op, DAG, dl);
18555 }
18556
18557 // Only some types will be legal on some subtargets. If we can emit a legal
18558 // VSELECT-matching blend, return Op, but if we need to expand, return
18559 // a null value.
18560 switch (VT.SimpleTy) {
18561 default:
18562 // Most of the vector types have blends past SSE4.1.
18563 return Op;
18564
18565 case MVT::v32i8:
18566 // The byte blends for AVX vectors were introduced only in AVX2.
18567 if (Subtarget.hasAVX2())
18568 return Op;
18569
18570 return SDValue();
18571
18572 case MVT::v8i16:
18573 case MVT::v16i16:
18574 case MVT::v8f16:
18575 case MVT::v16f16: {
18576 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18577 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18578 Cond = DAG.getBitcast(CastVT, Cond);
18579 LHS = DAG.getBitcast(CastVT, LHS);
18580 RHS = DAG.getBitcast(CastVT, RHS);
18581 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18582 return DAG.getBitcast(VT, Select);
18583 }
18584 }
18585}
18586
18587 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18588 MVT VT = Op.getSimpleValueType();
18589 SDValue Vec = Op.getOperand(0);
18590 SDValue Idx = Op.getOperand(1);
18591 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18592 SDLoc dl(Op);
18593
18594 if (!Vec.getSimpleValueType().is128BitVector())
18595 return SDValue();
18596
18597 if (VT.getSizeInBits() == 8) {
18598 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18599 // we're going to zero extend the register or fold the store.
18600 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18601 !X86::mayFoldIntoStore(Op))
18602 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18603 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18604 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18605
18606 unsigned IdxVal = Idx->getAsZExtVal();
18607 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18608 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18609 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18610 }
18611
18612 if (VT == MVT::f32) {
18613 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18614 // the result back to FR32 register. It's only worth matching if the
18615 // result has a single use which is a store or a bitcast to i32. And in
18616 // the case of a store, it's not worth it if the index is a constant 0,
18617 // because a MOVSSmr can be used instead, which is smaller and faster.
18618 if (!Op.hasOneUse())
18619 return SDValue();
18620 SDNode *User = *Op.getNode()->user_begin();
18621 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18622 (User->getOpcode() != ISD::BITCAST ||
18623 User->getValueType(0) != MVT::i32))
18624 return SDValue();
18625 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18626 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18627 return DAG.getBitcast(MVT::f32, Extract);
18628 }
18629
18630 if (VT == MVT::i32 || VT == MVT::i64)
18631 return Op;
18632
18633 return SDValue();
18634}
18635
18636/// Extract one bit from mask vector, like v16i1 or v8i1.
18637/// AVX-512 feature.
18638 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18639 const X86Subtarget &Subtarget) {
18640 SDValue Vec = Op.getOperand(0);
18641 SDLoc dl(Vec);
18642 MVT VecVT = Vec.getSimpleValueType();
18643 SDValue Idx = Op.getOperand(1);
18644 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18645 MVT EltVT = Op.getSimpleValueType();
18646
18647 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18648 "Unexpected vector type in ExtractBitFromMaskVector");
18649
18650 // variable index can't be handled in mask registers,
18651 // extend vector to VR512/128
18652 if (!IdxC) {
18653 unsigned NumElts = VecVT.getVectorNumElements();
18654 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18655 // than extending to 128/256-bit.
18656 if (NumElts == 1) {
18657 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18658 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18659 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18660 }
18661 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18662 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18663 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18664 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18665 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18666 }
18667
18668 unsigned IdxVal = IdxC->getZExtValue();
18669 if (IdxVal == 0) // the operation is legal
18670 return Op;
18671
18672 // Extend to natively supported kshift.
18673 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18674
18675 // Use kshiftr instruction to move to the lower element.
18676 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18677 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18678
18679 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18680 DAG.getVectorIdxConstant(0, dl));
18681}
18682
18683// Helper to find all the extracted elements from a vector.
18684 static APInt getExtractedDemandedElts(SDNode *N) {
18685 MVT VT = N->getSimpleValueType(0);
18686 unsigned NumElts = VT.getVectorNumElements();
18687 APInt DemandedElts = APInt::getZero(NumElts);
18688 for (SDNode *User : N->users()) {
18689 switch (User->getOpcode()) {
18690 case X86ISD::PEXTRB:
18691 case X86ISD::PEXTRW:
18692 case ISD::EXTRACT_VECTOR_ELT:
18693 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18694 DemandedElts.setAllBits();
18695 return DemandedElts;
18696 }
18697 DemandedElts.setBit(User->getConstantOperandVal(1));
18698 break;
18699 case ISD::BITCAST: {
18700 if (!User->getValueType(0).isSimple() ||
18701 !User->getValueType(0).isVector()) {
18702 DemandedElts.setAllBits();
18703 return DemandedElts;
18704 }
18705 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18706 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18707 break;
18708 }
18709 default:
18710 DemandedElts.setAllBits();
18711 return DemandedElts;
18712 }
18713 }
18714 return DemandedElts;
18715}
18716
18717SDValue
18718X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18719 SelectionDAG &DAG) const {
18720 SDLoc dl(Op);
18721 SDValue Vec = Op.getOperand(0);
18722 MVT VecVT = Vec.getSimpleValueType();
18723 SDValue Idx = Op.getOperand(1);
18724 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18725
18726 if (VecVT.getVectorElementType() == MVT::i1)
18727 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18728
18729 if (!IdxC) {
18730 // It's more profitable to go through memory (1 cycle throughput)
18731 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18732 // IACA tool was used to get performance estimation
18733 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18734 //
18735 // example : extractelement <16 x i8> %a, i32 %i
18736 //
18737 // Block Throughput: 3.00 Cycles
18738 // Throughput Bottleneck: Port5
18739 //
18740 // | Num Of | Ports pressure in cycles | |
18741 // | Uops | 0 - DV | 5 | 6 | 7 | |
18742 // ---------------------------------------------
18743 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18744 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18745 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18746 // Total Num Of Uops: 4
18747 //
18748 //
18749 // Block Throughput: 1.00 Cycles
18750 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18751 //
18752 // | | Ports pressure in cycles | |
18753 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18754 // ---------------------------------------------------------
18755 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18756 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18757 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18758 // Total Num Of Uops: 4
18759
18760 return SDValue();
18761 }
18762
18763 unsigned IdxVal = IdxC->getZExtValue();
18764
18765 // If this is a 256-bit vector result, first extract the 128-bit vector and
18766 // then extract the element from the 128-bit vector.
18767 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18768 // Get the 128-bit vector.
18769 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18770 MVT EltVT = VecVT.getVectorElementType();
18771
18772 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18773 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18774
18775 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18776 // this can be done with a mask.
18777 IdxVal &= ElemsPerChunk - 1;
18778 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18779 DAG.getVectorIdxConstant(IdxVal, dl));
18780 }
18781
18782 assert(VecVT.is128BitVector() && "Unexpected vector length");
18783
18784 MVT VT = Op.getSimpleValueType();
18785
18786 if (VT == MVT::i16) {
18787 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18788 // we're going to zero extend the register or fold the store (SSE41 only).
18789 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18790 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18791 if (Subtarget.hasFP16())
18792 return Op;
18793
18794 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18795 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18796 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18797 }
18798
18799 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18800 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18801 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18802 }
18803
18804 if (Subtarget.hasSSE41())
18805 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18806 return Res;
18807
18808 // Only extract a single element from a v16i8 source - determine the common
18809 // DWORD/WORD that all extractions share, and extract the sub-byte.
18810 // TODO: Add QWORD MOVQ extraction?
18811 if (VT == MVT::i8) {
18812 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18813 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18814
18815 // Extract either the lowest i32 or any i16, and extract the sub-byte.
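// Editorial note: e.g. extracting byte 5 of a v16i8 can be done as a single word
// extract (v8i16 element 2) followed by a right shift of 8 bits and a truncate,
// provided the other extracted bytes allow it.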
18816 int DWordIdx = IdxVal / 4;
18817 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18818 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18819 DAG.getBitcast(MVT::v4i32, Vec),
18820 DAG.getVectorIdxConstant(DWordIdx, dl));
18821 int ShiftVal = (IdxVal % 4) * 8;
18822 if (ShiftVal != 0)
18823 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18824 DAG.getConstant(ShiftVal, dl, MVT::i8));
18825 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18826 }
18827
18828 int WordIdx = IdxVal / 2;
18829 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18830 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18831 DAG.getBitcast(MVT::v8i16, Vec),
18832 DAG.getVectorIdxConstant(WordIdx, dl));
18833 int ShiftVal = (IdxVal % 2) * 8;
18834 if (ShiftVal != 0)
18835 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18836 DAG.getConstant(ShiftVal, dl, MVT::i8));
18837 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18838 }
18839 }
18840
18841 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18842 if (IdxVal == 0)
18843 return Op;
18844
18845 // Shuffle the element to the lowest element, then movss or movsh.
18846 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18847 Mask[0] = static_cast<int>(IdxVal);
18848 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18849 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18850 DAG.getVectorIdxConstant(0, dl));
18851 }
18852
18853 if (VT.getSizeInBits() == 64) {
18854 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18855 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18856 // to match extract_elt for f64.
18857 if (IdxVal == 0)
18858 return Op;
18859
18860 // UNPCKHPD the element to the lowest double word, then movsd.
18861 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18862 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18863 int Mask[2] = { 1, -1 };
18864 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18866 DAG.getVectorIdxConstant(0, dl));
18867 }
18868
18869 return SDValue();
18870}
18871
18872/// Insert one bit to mask vector, like v16i1 or v8i1.
18873/// AVX-512 feature.
18874 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18875 const X86Subtarget &Subtarget) {
18876 SDLoc dl(Op);
18877 SDValue Vec = Op.getOperand(0);
18878 SDValue Elt = Op.getOperand(1);
18879 SDValue Idx = Op.getOperand(2);
18880 MVT VecVT = Vec.getSimpleValueType();
18881
18882 if (!isa<ConstantSDNode>(Idx)) {
18883 // Non constant index. Extend source and destination,
18884 // insert element and then truncate the result.
18885 unsigned NumElts = VecVT.getVectorNumElements();
18886 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18887 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18888 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18889 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18890 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18891 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18892 }
18893
18894 // Copy into a k-register, extract to v1i1 and insert_subvector.
18895 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18896 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18897}
18898
18899SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18900 SelectionDAG &DAG) const {
18901 MVT VT = Op.getSimpleValueType();
18902 MVT EltVT = VT.getVectorElementType();
18903 unsigned NumElts = VT.getVectorNumElements();
18904 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18905
18906 if (EltVT == MVT::i1)
18907 return InsertBitToMaskVector(Op, DAG, Subtarget);
18908
18909 SDLoc dl(Op);
18910 SDValue N0 = Op.getOperand(0);
18911 SDValue N1 = Op.getOperand(1);
18912 SDValue N2 = Op.getOperand(2);
18913 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18914
18915 if (EltVT == MVT::bf16) {
18916 MVT IVT = VT.changeVectorElementTypeToInteger();
18917 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18918 DAG.getBitcast(IVT, N0),
18919 DAG.getBitcast(MVT::i16, N1), N2);
18920 return DAG.getBitcast(VT, Res);
18921 }
18922
18923 if (!N2C) {
18924 // Variable insertion indices, usually we're better off spilling to stack,
18925 // but AVX512 can use a variable compare+select by comparing against all
18926 // possible vector indices, and FP insertion has less gpr->simd traffic.
18927 if (!(Subtarget.hasBWI() ||
18928 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18929 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18930 return SDValue();
18931
18932 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18933 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18934 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18935 return SDValue();
18936
18937 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18938 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18939 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18940
18941 SmallVector<SDValue, 16> RawIndices;
18942 for (unsigned I = 0; I != NumElts; ++I)
18943 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18944 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18945
18946 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18947 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18948 ISD::SETEQ);
18949 }
18950
18951 if (N2C->getAPIntValue().uge(NumElts))
18952 return SDValue();
18953 uint64_t IdxVal = N2C->getZExtValue();
18954
18955 bool IsZeroElt = X86::isZeroNode(N1);
18956 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18957
18958 if (IsZeroElt || IsAllOnesElt) {
18959 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18960 // We don't deal with i8 0 since it appears to be handled elsewhere.
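// Editorial note: e.g. inserting -1 into element 3 of a v16i8 without SSE4.1
// becomes an OR with the constant vector <0,0,0,255,0,...,0>.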
18961 if (IsAllOnesElt &&
18962 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18963 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18964 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18965 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18966 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18967 CstVectorElts[IdxVal] = OnesCst;
18968 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18969 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18970 }
18971 // See if we can do this more efficiently with a blend shuffle with a
18972 // rematerializable vector.
18973 if (Subtarget.hasSSE41() &&
18974 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18975 SmallVector<int, 8> BlendMask;
18976 for (unsigned i = 0; i != NumElts; ++i)
18977 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18978 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18979 : getOnesVector(VT, DAG, dl);
18980 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18981 }
18982 }
18983
18984 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18985 // into that, and then insert the subvector back into the result.
18986 if (VT.is256BitVector() || VT.is512BitVector()) {
18987 // With a 256-bit vector, we can insert into the zero element efficiently
18988 // using a blend if we have AVX or AVX2 and the right data type.
18989 if (VT.is256BitVector() && IdxVal == 0) {
18990 // TODO: It is worthwhile to cast integer to floating point and back
18991 // and incur a domain crossing penalty if that's what we'll end up
18992 // doing anyway after extracting to a 128-bit vector.
18993 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18994 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18995 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18996 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18997 DAG.getTargetConstant(1, dl, MVT::i8));
18998 }
18999 }
19000
19001 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19002 assert(isPowerOf2_32(NumEltsIn128) &&
19003 "Vectors will always have power-of-two number of elements.");
19004
19005 // If we are not inserting into the low 128-bit vector chunk,
19006 // then prefer the broadcast+blend sequence.
19007 // FIXME: relax the profitability check iff all N1 uses are insertions.
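// Editorial note: e.g. with AVX2, inserting into element 9 of a v16i16 splats the
// scalar and blends just lane 9, avoiding an extract/insert of the upper
// 128-bit half.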
19008 if (IdxVal >= NumEltsIn128 &&
19009 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19010 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19011 X86::mayFoldLoad(N1, Subtarget)))) {
19012 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19013 SmallVector<int, 8> BlendMask;
19014 for (unsigned i = 0; i != NumElts; ++i)
19015 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19016 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19017 }
19018
19019 // Get the desired 128-bit vector chunk.
19020 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19021
19022 // Insert the element into the desired chunk.
19023 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19024 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19025
19026 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19027 DAG.getVectorIdxConstant(IdxIn128, dl));
19028
19029 // Insert the changed part back into the bigger vector
19030 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19031 }
19032 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19033
19034 // This will be just movw/movd/movq/movsh/movss/movsd.
19035 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19036 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19037 EltVT == MVT::f16 || EltVT == MVT::i64) {
19038 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19039 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19040 }
19041
19042 // We can't directly insert an i8 or i16 into a vector, so zero extend
19043 // it to i32 first.
19044 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19045 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19046 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19047 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19048 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19049 return DAG.getBitcast(VT, N1);
19050 }
19051 }
19052
19053 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19054 // argument. SSE41 required for pinsrb.
19055 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19056 unsigned Opc;
19057 if (VT == MVT::v8i16) {
19058 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19059 Opc = X86ISD::PINSRW;
19060 } else {
19061 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19062 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19063 Opc = X86ISD::PINSRB;
19064 }
19065
19066 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19067 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19068 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19069 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19070 }
19071
19072 if (Subtarget.hasSSE41()) {
19073 if (EltVT == MVT::f32) {
19074 // Bits [7:6] of the constant are the source select. This will always be
19075 // zero here. The DAG Combiner may combine an extract_elt index into
19076 // these bits. For example (insert (extract, 3), 2) could be matched by
19077 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19078 // Bits [5:4] of the constant are the destination select. This is the
19079 // value of the incoming immediate.
19080 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19081 // combine either bitwise AND or insert of float 0.0 to set these bits.
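// Editorial note: e.g. inserting into destination lane 2 with nothing zeroed uses
// the immediate 0x20 (source select 0, destination select 2, zero mask 0).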
19082
19083 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19084 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19085 // If this is an insertion of 32-bits into the low 32-bits of
19086 // a vector, we prefer to generate a blend with immediate rather
19087 // than an insertps. Blends are simpler operations in hardware and so
19088 // will always have equal or better performance than insertps.
19089 // But if optimizing for size and there's a load folding opportunity,
19090 // generate insertps because blendps does not have a 32-bit memory
19091 // operand form.
19092 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19093 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19094 DAG.getTargetConstant(1, dl, MVT::i8));
19095 }
19096 // Create this as a scalar to vector.
19097 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19098 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19099 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19100 }
19101
19102 // PINSR* works with constant index.
19103 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19104 return Op;
19105 }
19106
19107 return SDValue();
19108}
19109
19110 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19111 SelectionDAG &DAG) {
19112 SDLoc dl(Op);
19113 MVT OpVT = Op.getSimpleValueType();
19114
19115 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19116 // further combines.
19117 if (X86::isZeroNode(Op.getOperand(0)))
19118 return getZeroVector(OpVT, Subtarget, DAG, dl);
19119
19120 // If this is a 256-bit vector result, first insert into a 128-bit
19121 // vector and then insert into the 256-bit vector.
19122 if (!OpVT.is128BitVector()) {
19123 // Insert into a 128-bit vector.
19124 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19125 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19126 OpVT.getVectorNumElements() / SizeFactor);
19127
19128 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19129
19130 // Insert the 128-bit vector.
19131 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19132 }
19133 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19134 "Expected an SSE type!");
19135
19136 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19137 // tblgen.
19138 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19139 return Op;
19140
19141 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19142 return DAG.getBitcast(
19143 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19144}
19145
19146// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19147// simple superregister reference or explicit instructions to insert
19148// the upper bits of a vector.
19149 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19150 SelectionDAG &DAG) {
19151 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19152
19153 return insert1BitVector(Op, DAG, Subtarget);
19154}
19155
19156 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19157 SelectionDAG &DAG) {
19158 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19159 "Only vXi1 extract_subvectors need custom lowering");
19160
19161 SDLoc dl(Op);
19162 SDValue Vec = Op.getOperand(0);
19163 uint64_t IdxVal = Op.getConstantOperandVal(1);
19164
19165 if (IdxVal == 0) // the operation is legal
19166 return Op;
19167
19168 // Extend to natively supported kshift.
19169 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19170
19171 // Shift to the LSB.
19172 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19173 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19174
19175 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19176 DAG.getVectorIdxConstant(0, dl));
19177}
19178
19179// Returns the appropriate wrapper opcode for a global reference.
19180unsigned X86TargetLowering::getGlobalWrapperKind(
19181 const GlobalValue *GV, const unsigned char OpFlags) const {
19182 // References to absolute symbols are never PC-relative.
19183 if (GV && GV->isAbsoluteSymbolRef())
19184 return X86ISD::Wrapper;
19185
19186 // The following OpFlags under RIP-rel PIC use RIP.
19187 if (Subtarget.isPICStyleRIPRel() &&
19188 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19189 OpFlags == X86II::MO_DLLIMPORT))
19190 return X86ISD::WrapperRIP;
19191
19192 // GOTPCREL references must always use RIP.
19193 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19194 return X86ISD::WrapperRIP;
19195
19196 return X86ISD::Wrapper;
19197}
19198
19199// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19200// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19201// one of the above mentioned nodes. It has to be wrapped because otherwise
19202// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19203 // be used to form an addressing mode. These wrapped nodes will be selected
19204// into MOV32ri.
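// Editorial note: e.g. a constant pool reference becomes
// (X86ISD::Wrapper (TargetConstantPool ...)), optionally added to the PIC base
// register, as done in LowerConstantPool below.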
19205SDValue
19206X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19207 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19208
19209 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19210 // global base reg.
19211 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19212
19213 auto PtrVT = getPointerTy(DAG.getDataLayout());
19214 SDValue Result = DAG.getTargetConstantPool(
19215 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19216 SDLoc DL(CP);
19217 Result =
19218 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19219 // With PIC, the address is actually $g + Offset.
19220 if (OpFlag) {
19221 Result =
19222 DAG.getNode(ISD::ADD, DL, PtrVT,
19223 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19224 }
19225
19226 return Result;
19227}
19228
19229SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19230 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19231
19232 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19233 // global base reg.
19234 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19235
19236 EVT PtrVT = Op.getValueType();
19237 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19238 SDLoc DL(JT);
19239 Result =
19240 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19241
19242 // With PIC, the address is actually $g + Offset.
19243 if (OpFlag)
19244 Result =
19245 DAG.getNode(ISD::ADD, DL, PtrVT,
19246 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19247
19248 return Result;
19249}
19250
19251SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19252 SelectionDAG &DAG) const {
19253 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19254}
19255
19256SDValue
19257X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19258 // Create the TargetBlockAddressAddress node.
19259 unsigned char OpFlags =
19260 Subtarget.classifyBlockAddressReference();
19261 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19262 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19263 SDLoc dl(Op);
19264 EVT PtrVT = Op.getValueType();
19265 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19266 Result =
19267 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19268
19269 // With PIC, the address is actually $g + Offset.
19270 if (isGlobalRelativeToPICBase(OpFlags)) {
19271 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19272 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19273 }
19274
19275 return Result;
19276}
19277
19278/// Creates target global address or external symbol nodes for calls or
19279/// other uses.
19280SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19281 bool ForCall,
19282 bool *IsImpCall) const {
19283 // Unpack the global address or external symbol.
19284 SDLoc dl(Op);
19285 const GlobalValue *GV = nullptr;
19286 int64_t Offset = 0;
19287 const char *ExternalSym = nullptr;
19288 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19289 GV = G->getGlobal();
19290 Offset = G->getOffset();
19291 } else {
19292 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19293 ExternalSym = ES->getSymbol();
19294 }
19295
19296 // Calculate some flags for address lowering.
19297 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19298 unsigned char OpFlags;
19299 if (ForCall)
19300 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19301 else
19302 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19303 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19304 bool NeedsLoad = isGlobalStubReference(OpFlags);
19305
19306 CodeModel::Model M = DAG.getTarget().getCodeModel();
19307 EVT PtrVT = Op.getValueType();
19308 SDValue Result;
19309
19310 if (GV) {
19311 // Create a target global address if this is a global. If possible, fold the
19312 // offset into the global address reference. Otherwise, ADD it on later.
19313 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19314 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19315 // relocation will compute to a negative value, which is invalid.
19316 int64_t GlobalOffset = 0;
19317 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19318 X86::isOffsetSuitableForCodeModel(Offset, M)) {
19319 std::swap(GlobalOffset, Offset);
19320 }
19321 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19322 } else {
19323 // If this is not a global address, this must be an external symbol.
19324 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19325 }
19326
19327 // If this is a direct call, avoid the wrapper if we don't need to do any
19328 // loads or adds. This allows SDAG ISel to match direct calls.
19329 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19330 return Result;
19331
19332 // If Import Call Optimization is enabled and this is an imported function
19333 // then make a note of it and return the global address without wrapping.
19334 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19335 Mod.getModuleFlag("import-call-optimization")) {
19336 assert(ForCall && "Should only enable import call optimization if we are "
19337 "lowering a call");
19338 *IsImpCall = true;
19339 return Result;
19340 }
19341
19342 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19343
19344 // With PIC, the address is actually $g + Offset.
19345 if (HasPICReg) {
19346 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19347 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19348 }
19349
19350 // For globals that require a load from a stub to get the address, emit the
19351 // load.
19352 if (NeedsLoad)
19353 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19354 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19355
19356 // If there was a non-zero offset that we didn't fold, create an explicit
19357 // addition for it.
19358 if (Offset != 0)
19359 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19360 DAG.getSignedConstant(Offset, dl, PtrVT));
19361
19362 return Result;
19363}
19364
19365SDValue
19366X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19367 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19368}
19369
19370 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19371 const EVT PtrVT, unsigned ReturnReg,
19372 unsigned char OperandFlags,
19373 bool LoadGlobalBaseReg = false,
19374 bool LocalDynamic = false) {
19375 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19376 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19377 SDLoc dl(GA);
19378 SDValue TGA;
19379 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19380 SDValue Chain = DAG.getEntryNode();
19381 SDValue Ret;
19382 if (LocalDynamic && UseTLSDESC) {
19383 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19384 // Reuse existing GetTLSADDR node if we can find it.
19385 if (TGA->hasOneUse()) {
19386 // TLSDESC uses TGA.
19387 SDNode *TLSDescOp = *TGA->user_begin();
19388 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19389 "Unexpected TLSDESC DAG");
19390 // CALLSEQ_END uses TGA via a chain and glue.
19391 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19392 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19393 "Unexpected TLSDESC DAG");
19394 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19395 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19396 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19397 "Unexpected TLSDESC DAG");
19398 Ret = SDValue(CopyFromRegOp, 0);
19399 }
19400 } else {
19401 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19402 GA->getOffset(), OperandFlags);
19403 }
19404
19405 if (!Ret) {
19406 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19407 : LocalDynamic ? X86ISD::TLSBASEADDR
19408 : X86ISD::TLSADDR;
19409
19410 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19411 if (LoadGlobalBaseReg) {
19412 SDValue InGlue;
19413 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19414 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19415 InGlue);
19416 InGlue = Chain.getValue(1);
19417 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19418 } else {
19419 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19420 }
19421 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19422
19423 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19424 MFI.setHasCalls(true);
19425
19426 SDValue Glue = Chain.getValue(1);
19427 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19428 }
19429
19430 if (!UseTLSDESC)
19431 return Ret;
19432
19433 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19434 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19435
19437 SDValue Offset =
19438 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19439 MachinePointerInfo(Ptr));
19440 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19441}
19442
19443// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19444static SDValue
19445 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19446 const EVT PtrVT) {
19447 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19448 /*LoadGlobalBaseReg=*/true);
19449}
19450
19451// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19452static SDValue
19453 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19454 const EVT PtrVT) {
19455 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19456}
19457
19458// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19459static SDValue
19460 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19461 const EVT PtrVT) {
19462 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19463}
19464
19465 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19466 SelectionDAG &DAG, const EVT PtrVT,
19467 bool Is64Bit, bool Is64BitLP64) {
19468 SDLoc dl(GA);
19469
19470 // Get the start address of the TLS block for this module.
19471 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19472 .getInfo<X86MachineFunctionInfo>();
19473 MFI->incNumLocalDynamicTLSAccesses();
19474
19475 SDValue Base;
19476 if (Is64Bit) {
19477 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19478 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19479 /*LoadGlobalBaseReg=*/false,
19480 /*LocalDynamic=*/true);
19481 } else {
19482 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19483 /*LoadGlobalBaseReg=*/true,
19484 /*LocalDynamic=*/true);
19485 }
19486
19487 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19488 // of Base.
19489
19490 // Build x@dtpoff.
19491 unsigned char OperandFlags = X86II::MO_DTPOFF;
19492 unsigned WrapperKind = X86ISD::Wrapper;
19493 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19494 GA->getValueType(0),
19495 GA->getOffset(), OperandFlags);
19496 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19497
19498 // Add x@dtpoff with the base.
19499 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19500}
19501
19502// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19503 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19504 const EVT PtrVT, TLSModel::Model model,
19505 bool is64Bit, bool isPIC) {
19506 SDLoc dl(GA);
19507
19508 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19509 Value *Ptr = Constant::getNullValue(
19510 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19511
19512 SDValue ThreadPointer =
19513 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19514 MachinePointerInfo(Ptr));
19515
19516 unsigned char OperandFlags = 0;
19517 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19518 // initialexec.
19519 unsigned WrapperKind = X86ISD::Wrapper;
19520 if (model == TLSModel::LocalExec) {
19521 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19522 } else if (model == TLSModel::InitialExec) {
19523 if (is64Bit) {
19524 OperandFlags = X86II::MO_GOTTPOFF;
19525 WrapperKind = X86ISD::WrapperRIP;
19526 } else {
19527 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19528 }
19529 } else {
19530 llvm_unreachable("Unexpected model");
19531 }
19532
19533 // emit "addl x@ntpoff,%eax" (local exec)
19534 // or "addl x@indntpoff,%eax" (initial exec)
19535 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19536 SDValue TGA =
19537 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19538 GA->getOffset(), OperandFlags);
19539 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19540
19541 if (model == TLSModel::InitialExec) {
19542 if (isPIC && !is64Bit) {
19543 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19544 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19545 Offset);
19546 }
19547
19548 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19549 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19550 }
19551
19552 // The address of the thread local variable is the add of the thread
19553 // pointer with the offset of the variable.
19554 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19555}
19556
19557SDValue
19558X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19559
19560 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19561
19562 if (DAG.getTarget().useEmulatedTLS())
19563 return LowerToTLSEmulatedModel(GA, DAG);
19564
19565 const GlobalValue *GV = GA->getGlobal();
19566 EVT PtrVT = Op.getValueType();
19567 bool PositionIndependent = isPositionIndependent();
19568
19569 if (Subtarget.isTargetELF()) {
19570 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19571 switch (model) {
19572 case TLSModel::GeneralDynamic:
19573 if (Subtarget.is64Bit()) {
19574 if (Subtarget.isTarget64BitLP64())
19575 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19576 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19577 }
19578 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19579 case TLSModel::LocalDynamic:
19580 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19581 Subtarget.isTarget64BitLP64());
19582 case TLSModel::InitialExec:
19583 case TLSModel::LocalExec:
19584 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19585 PositionIndependent);
19586 }
19587 llvm_unreachable("Unknown TLS model.");
19588 }
19589
19590 if (Subtarget.isTargetDarwin()) {
19591 // Darwin only has one model of TLS. Lower to that.
19592 unsigned char OpFlag = 0;
19593 unsigned WrapperKind = 0;
19594
19595 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19596 // global base reg.
19597 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19598 if (PIC32) {
19599 OpFlag = X86II::MO_TLVP_PIC_BASE;
19600 WrapperKind = X86ISD::Wrapper;
19601 } else {
19602 OpFlag = X86II::MO_TLVP;
19603 WrapperKind = X86ISD::WrapperRIP;
19604 }
19605 SDLoc DL(Op);
19606 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19607 GA->getValueType(0),
19608 GA->getOffset(), OpFlag);
19609 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19610
19611 // With PIC32, the address is actually $g + Offset.
19612 if (PIC32)
19613 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19614 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19615 Offset);
19616
19617 // Lowering the machine isd will make sure everything is in the right
19618 // location.
19619 SDValue Chain = DAG.getEntryNode();
19620 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19621 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19622 SDValue Args[] = { Chain, Offset };
19623 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19624 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19625
19626 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19627 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19628 MFI.setAdjustsStack(true);
19629
19630 // And our return value (tls address) is in the standard call return value
19631 // location.
19632 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19633 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19634 }
19635
19636 if (Subtarget.isOSWindows()) {
19637 // Just use the implicit TLS architecture
19638 // Need to generate something similar to:
19639 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19640 // ; from TEB
19641 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19642 // mov rcx, qword [rdx+rcx*8]
19643 // mov eax, .tls$:tlsvar
19644 // [rax+rcx] contains the address
19645 // Windows 64bit: gs:0x58
19646 // Windows 32bit: fs:__tls_array
19647
19648 SDLoc dl(GA);
19649 SDValue Chain = DAG.getEntryNode();
19650
19651 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19652 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19653 // use its literal value of 0x2C.
19654 Value *Ptr = Constant::getNullValue(
19655 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19656 : PointerType::get(*DAG.getContext(), X86AS::FS));
19657
19658 SDValue TlsArray = Subtarget.is64Bit()
19659 ? DAG.getIntPtrConstant(0x58, dl)
19660 : (Subtarget.isTargetWindowsGNU()
19661 ? DAG.getIntPtrConstant(0x2C, dl)
19662 : DAG.getExternalSymbol("_tls_array", PtrVT));
19663
19664 SDValue ThreadPointer =
19665 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19666
19667 SDValue res;
19668 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19669 res = ThreadPointer;
19670 } else {
19671 // Load the _tls_index variable
19672 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19673 if (Subtarget.is64Bit())
19674 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19675 MachinePointerInfo(), MVT::i32);
19676 else
19677 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19678
19679 const DataLayout &DL = DAG.getDataLayout();
19680 SDValue Scale =
19681 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19682 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19683
19684 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19685 }
19686
19687 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19688
19689 // Get the offset of start of .tls section
19690 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19691 GA->getValueType(0),
19692 X86II::MO_SECREL);
19693 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19694
19695 // The address of the thread local variable is the add of the thread
19696 // pointer with the offset of the variable.
19697 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19698 }
19699
19700 llvm_unreachable("TLS not implemented for this target.");
19701}
19702
19703 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19704 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19705 const TargetMachine &TM = getTargetMachine();
19706 TLSModel::Model Model = TM.getTLSModel(&GV);
19707 switch (Model) {
19708 case TLSModel::LocalExec:
19709 case TLSModel::InitialExec:
19710 // We can include the %fs segment register in addressing modes.
19711 return true;
19712 case TLSModel::GeneralDynamic:
19713 case TLSModel::LocalDynamic:
19714 // These models do not result in %fs relative addresses unless
19715 // TLS descriptors are used.
19716 //
19717 // Even in the case of TLS descriptors, we currently have no way to model
19718 // the difference between the %fs access and the computations needed for the
19719 // offset, and returning `true` for TLS-desc currently duplicates both,
19720 // which is detrimental :-/
19721 return false;
19722 }
19723 }
19724 return false;
19725}
19726
19727/// Lower SRA_PARTS and friends, which return two i32 values
19728/// and take a 2 x i32 value to shift plus a shift amount.
19729/// TODO: Can this be moved to general expansion code?
19730 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19731 SDValue Lo, Hi;
19732 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19733 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19734}
19735
19736// Try to use a packed vector operation to handle i64 on 32-bit targets when
19737// AVX512DQ is enabled.
19738 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19739 SelectionDAG &DAG,
19740 const X86Subtarget &Subtarget) {
19741 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19742 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19743 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19744 Op.getOpcode() == ISD::UINT_TO_FP) &&
19745 "Unexpected opcode!");
19746 bool IsStrict = Op->isStrictFPOpcode();
19747 unsigned OpNo = IsStrict ? 1 : 0;
19748 SDValue Src = Op.getOperand(OpNo);
19749 MVT SrcVT = Src.getSimpleValueType();
19750 MVT VT = Op.getSimpleValueType();
19751
19752 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19753 (VT != MVT::f32 && VT != MVT::f64))
19754 return SDValue();
19755
19756 // Pack the i64 into a vector, do the operation and extract.
19757
19758 // Using 256-bit to ensure result is 128-bits for f32 case.
19759 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19760 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19761 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19762
19763 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19764 if (IsStrict) {
19765 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19766 {Op.getOperand(0), InVec});
19767 SDValue Chain = CvtVec.getValue(1);
19768 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19769 DAG.getVectorIdxConstant(0, dl));
19770 return DAG.getMergeValues({Value, Chain}, dl);
19771 }
19772
19773 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19774
19775 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19776 DAG.getVectorIdxConstant(0, dl));
19777}
19778
19779// Try to use a packed vector operation to handle i64 on 32-bit targets.
19780 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19781 const X86Subtarget &Subtarget) {
19782 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19783 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19784 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19785 Op.getOpcode() == ISD::UINT_TO_FP) &&
19786 "Unexpected opcode!");
19787 bool IsStrict = Op->isStrictFPOpcode();
19788 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19789 MVT SrcVT = Src.getSimpleValueType();
19790 MVT VT = Op.getSimpleValueType();
19791
19792 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19793 return SDValue();
19794
19795 // Pack the i64 into a vector, do the operation and extract.
19796
19797 assert(Subtarget.hasFP16() && "Expected FP16");
19798
19799 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19800 if (IsStrict) {
19801 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19802 {Op.getOperand(0), InVec});
19803 SDValue Chain = CvtVec.getValue(1);
19804 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19805 DAG.getVectorIdxConstant(0, dl));
19806 return DAG.getMergeValues({Value, Chain}, dl);
19807 }
19808
19809 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19810
19811 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19812 DAG.getVectorIdxConstant(0, dl));
19813}
19814
19815static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19816 const X86Subtarget &Subtarget) {
19817 switch (Opcode) {
19818 case ISD::SINT_TO_FP:
19819 // TODO: Handle wider types with AVX/AVX512.
19820 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19821 return false;
19822 // CVTDQ2PS or (V)CVTDQ2PD
19823 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19824
19825 case ISD::UINT_TO_FP:
19826 // TODO: Handle wider types and i64 elements.
19827 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19828 return false;
19829 // VCVTUDQ2PS or VCVTUDQ2PD
19830 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19831
19832 default:
19833 return false;
19834 }
19835}
19836
19837/// Given a scalar cast operation that is extracted from a vector, try to
19838/// vectorize the cast op followed by extraction. This will avoid an expensive
19839/// round-trip between XMM and GPR.
19840 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19841 SelectionDAG &DAG,
19842 const X86Subtarget &Subtarget) {
19843 // TODO: This could be enhanced to handle smaller integer types by peeking
19844 // through an extend.
19845 SDValue Extract = Cast.getOperand(0);
19846 MVT DestVT = Cast.getSimpleValueType();
19847 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19848 !isa<ConstantSDNode>(Extract.getOperand(1)))
19849 return SDValue();
19850
19851 // See if we have a 128-bit vector cast op for this type of cast.
19852 SDValue VecOp = Extract.getOperand(0);
19853 MVT FromVT = VecOp.getSimpleValueType();
19854 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19855 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19856 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19857 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19858 return SDValue();
19859
19860 // If we are extracting from a non-zero element, first shuffle the source
19861 // vector to allow extracting from element zero.
19862 if (!isNullConstant(Extract.getOperand(1))) {
19863 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19864 Mask[0] = Extract.getConstantOperandVal(1);
19865 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19866 }
19867 // If the source vector is wider than 128-bits, extract the low part. Do not
19868 // create an unnecessarily wide vector cast op.
19869 if (FromVT != Vec128VT)
19870 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19871
19872 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19873 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19874 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19876 DAG.getVectorIdxConstant(0, DL));
19877}
19878
19879/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19880/// try to vectorize the cast ops. This will avoid an expensive round-trip
19881/// between XMM and GPR.
19882static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19883 SelectionDAG &DAG,
19884 const X86Subtarget &Subtarget) {
19885 // TODO: Allow FP_TO_UINT.
19886 SDValue CastToInt = CastToFP.getOperand(0);
19887 MVT VT = CastToFP.getSimpleValueType();
19888 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19889 return SDValue();
19890
19891 MVT IntVT = CastToInt.getSimpleValueType();
19892 SDValue X = CastToInt.getOperand(0);
19893 MVT SrcVT = X.getSimpleValueType();
19894 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19895 return SDValue();
19896
19897 // See if we have 128-bit vector cast instructions for this type of cast.
19898 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19899 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19900 IntVT != MVT::i32)
19901 return SDValue();
19902
19903 unsigned SrcSize = SrcVT.getSizeInBits();
19904 unsigned IntSize = IntVT.getSizeInBits();
19905 unsigned VTSize = VT.getSizeInBits();
19906 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19907 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19908 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19909
19910 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19911 unsigned ToIntOpcode =
19912 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19913 unsigned ToFPOpcode =
19914 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19915
19916 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19917 //
19918 // We are not defining the high elements (for example, zeroing them) because
19919 // that could nullify any performance advantage that we hoped to gain from
19920 // this vector op hack. We do not expect any adverse effects (like denorm
19921 // penalties) with cast ops.
19922 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19923 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19924 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19925 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19926 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19927}
19928
19929static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19930 SelectionDAG &DAG,
19931 const X86Subtarget &Subtarget) {
19932 bool IsStrict = Op->isStrictFPOpcode();
19933 MVT VT = Op->getSimpleValueType(0);
19934 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19935
19936 if (Subtarget.hasDQI()) {
19937 assert(!Subtarget.hasVLX() && "Unexpected features");
19938
19939 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19940 Src.getSimpleValueType() == MVT::v4i64) &&
19941 "Unsupported custom type");
19942
19943 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19944 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19945 "Unexpected VT!");
19946 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19947
19948 // Need to concat with zero vector for strict fp to avoid spurious
19949 // exceptions.
19950 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19951 : DAG.getUNDEF(MVT::v8i64);
19952 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19953 DAG.getVectorIdxConstant(0, DL));
19954 SDValue Res, Chain;
19955 if (IsStrict) {
19956 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19957 {Op->getOperand(0), Src});
19958 Chain = Res.getValue(1);
19959 } else {
19960 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19961 }
19962
19963 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19964 DAG.getVectorIdxConstant(0, DL));
19965
19966 if (IsStrict)
19967 return DAG.getMergeValues({Res, Chain}, DL);
19968 return Res;
19969 }
19970
19971 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19972 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19973 if (VT != MVT::v4f32 || IsSigned)
19974 return SDValue();
19975
19976 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19977 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19978 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19979 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19980 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19981 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19982 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19983 SmallVector<SDValue, 4> SignCvts(4);
19984 SmallVector<SDValue, 4> Chains(4);
19985 for (int i = 0; i != 4; ++i) {
19986 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19987 DAG.getVectorIdxConstant(i, DL));
19988 if (IsStrict) {
19989 SignCvts[i] =
19990 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19991 {Op.getOperand(0), Elt});
19992 Chains[i] = SignCvts[i].getValue(1);
19993 } else {
19994 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19995 }
19996 }
19997 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19998
19999 SDValue Slow, Chain;
20000 if (IsStrict) {
20001 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20002 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20003 {Chain, SignCvt, SignCvt});
20004 Chain = Slow.getValue(1);
20005 } else {
20006 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20007 }
20008
20009 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20010 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20011
20012 if (IsStrict)
20013 return DAG.getMergeValues({Cvt, Chain}, DL);
20014
20015 return Cvt;
20016}
20017
20018static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20019 SelectionDAG &DAG) {
20020 bool IsStrict = Op->isStrictFPOpcode();
20021 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20022 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20023 MVT VT = Op.getSimpleValueType();
20024 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20025
20026 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20027 if (IsStrict)
20028 return DAG.getNode(
20029 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20030 {Chain,
20031 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20032 Rnd});
20033 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20034 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20035}
20036
20037static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20038 const X86Subtarget &Subtarget) {
20039 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20040 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20041 return true;
20042 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20043 return true;
20044 }
20045 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20046 return true;
20047 if (Subtarget.useAVX512Regs()) {
20048 if (VT == MVT::v16i32)
20049 return true;
20050 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20051 return true;
20052 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20053 return true;
20054 }
20055 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20056 (VT == MVT::v2i64 || VT == MVT::v4i64))
20057 return true;
20058 return false;
20059}
20060
20061SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20062 SelectionDAG &DAG) const {
20063 bool IsStrict = Op->isStrictFPOpcode();
20064 unsigned OpNo = IsStrict ? 1 : 0;
20065 SDValue Src = Op.getOperand(OpNo);
20066 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20067 MVT SrcVT = Src.getSimpleValueType();
20068 MVT VT = Op.getSimpleValueType();
20069 SDLoc dl(Op);
20070
20071 if (isSoftF16(VT, Subtarget))
20072 return promoteXINT_TO_FP(Op, dl, DAG);
20073 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20074 return Op;
20075
20076 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20077 return LowerWin64_INT128_TO_FP(Op, DAG);
20078
20079 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20080 return Extract;
20081
20082 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20083 return R;
20084
20085 if (SrcVT.isVector()) {
20086 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20087 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20088 // source for strict FP.
20089 if (IsStrict)
20090 return DAG.getNode(
20091 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20092 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20093 DAG.getUNDEF(SrcVT))});
20094 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20095 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20096 DAG.getUNDEF(SrcVT)));
20097 }
20098 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20099 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20100
20101 return SDValue();
20102 }
20103
20104 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20105 "Unknown SINT_TO_FP to lower!");
20106
20107 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20108
20109 // These are really Legal; return the operand so the caller accepts it as
20110 // Legal.
20111 if (SrcVT == MVT::i32 && UseSSEReg)
20112 return Op;
20113 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20114 return Op;
20115
20116 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20117 return V;
20118 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20119 return V;
20120
20121 // SSE doesn't have an i16 conversion so we need to promote.
20122 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20123 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20124 if (IsStrict)
20125 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20126 {Chain, Ext});
20127
20128 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20129 }
20130
20131 if (VT == MVT::f128 || !Subtarget.hasX87())
20132 return SDValue();
20133
20134 SDValue ValueToStore = Src;
20135 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20136 // Bitcasting to f64 here allows us to do a single 64-bit store from
20137 // an SSE register, avoiding the store forwarding penalty that would come
20138 // with two 32-bit stores.
20139 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20140
20141 unsigned Size = SrcVT.getStoreSize();
20142 Align Alignment(Size);
20143 MachineFunction &MF = DAG.getMachineFunction();
20144 auto PtrVT = getPointerTy(MF.getDataLayout());
20145 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20146 MachinePointerInfo MPI =
20147 MachinePointerInfo::getFixedStack(MF, SSFI);
20148 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20149 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20150 std::pair<SDValue, SDValue> Tmp =
20151 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20152
20153 if (IsStrict)
20154 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20155
20156 return Tmp.first;
20157}
20158
20159std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20160 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20161 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20162 // Build the FILD
20163 SDVTList Tys;
20164 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20165 if (useSSE)
20166 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20167 else
20168 Tys = DAG.getVTList(DstVT, MVT::Other);
20169
20170 SDValue FILDOps[] = {Chain, Pointer};
20171 SDValue Result =
20172 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20173 Alignment, MachineMemOperand::MOLoad);
20174 Chain = Result.getValue(1);
20175
20176 if (useSSE) {
20177 MachineFunction &MF = DAG.getMachineFunction();
20178 unsigned SSFISize = DstVT.getStoreSize();
20179 int SSFI =
20180 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20181 auto PtrVT = getPointerTy(MF.getDataLayout());
20182 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20183 Tys = DAG.getVTList(MVT::Other);
20184 SDValue FSTOps[] = {Chain, Result, StackSlot};
20185 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20186 MachinePointerInfo::getFixedStack(MF, SSFI),
20187 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20188
20189 Chain =
20190 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20191 Result = DAG.getLoad(
20192 DstVT, DL, Chain, StackSlot,
20193 MachinePointerInfo::getFixedStack(MF, SSFI));
20194 Chain = Result.getValue(1);
20195 }
20196
20197 return { Result, Chain };
20198}
20199
20200/// Horizontal vector math instructions may be slower than normal math with
20201/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20202/// implementation, and likely shuffle complexity of the alternate sequence.
20203static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20204 const X86Subtarget &Subtarget) {
20205 bool IsOptimizingSize = DAG.shouldOptForSize();
20206 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20207 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20208}
20209
20210/// 64-bit unsigned integer to double expansion.
20211static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20212 SelectionDAG &DAG,
20213 const X86Subtarget &Subtarget) {
20214 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20215 // when converting 0 while rounding toward negative infinity. The caller will
20216 // fall back to Expand when i64 SINT_TO_FP is legal, or use FILD in 32-bit mode.
20217 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20218 // This algorithm is not obvious. Here it is what we're trying to output:
20219 /*
20220 movq %rax, %xmm0
20221 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20222 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20223 #ifdef __SSE3__
20224 haddpd %xmm0, %xmm0
20225 #else
20226 pshufd $0x4e, %xmm0, %xmm1
20227 addpd %xmm1, %xmm0
20228 #endif
20229 */
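  // To see why this works: with x = hi * 2^32 + lo (lo/hi the unsigned 32-bit
  // halves of the input), the punpckldq above builds the two doubles
  //   d0 = bits(0x43300000, lo) = 2^52 + lo
  //   d1 = bits(0x45300000, hi) = 2^84 + hi * 2^32
  // both exactly, since lo and hi * 2^32 are multiples of those exponents' ulps.
  // Subtracting c1 = { 2^52, 2^84 } leaves { lo, hi * 2^32 }, and the final
  // horizontal add rounds lo + hi * 2^32 == x just once. For x == 0 under
  // round-toward-negative-infinity each difference is -0.0, which is why the
  // strict-FP case is rejected above.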
20230
20231 LLVMContext *Context = DAG.getContext();
20232
20233 // Build some magic constants.
20234 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20235 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20236 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20237 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20238
20239 SmallVector<Constant*,2> CV1;
20240 CV1.push_back(
20241 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20242 APInt(64, 0x4330000000000000ULL))));
20243 CV1.push_back(
20244 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20245 APInt(64, 0x4530000000000000ULL))));
20246 Constant *C1 = ConstantVector::get(CV1);
20247 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20248
20249 // Load the 64-bit value into an XMM register.
20250 SDValue XR1 =
20251 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20252 SDValue CLod0 = DAG.getLoad(
20253 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20254 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20255 SDValue Unpck1 =
20256 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20257
20258 SDValue CLod1 = DAG.getLoad(
20259 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20260 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20261 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20262 // TODO: Are there any fast-math-flags to propagate here?
20263 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20264 SDValue Result;
20265
20266 if (Subtarget.hasSSE3() &&
20267 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20268 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20269 } else {
20270 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20271 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20272 }
20273 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20274 DAG.getVectorIdxConstant(0, dl));
20275 return Result;
20276}
20277
20278/// 32-bit unsigned integer to float expansion.
20279static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20280 SelectionDAG &DAG,
20281 const X86Subtarget &Subtarget) {
20282 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20283 // FP constant to bias correct the final result.
20284 SDValue Bias = DAG.getConstantFP(
20285 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20286
20287 // Load the 32-bit value into an XMM register.
20288 SDValue Load =
20289 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20290
20291 // Zero out the upper parts of the register.
20292 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20293
20294 // Or the load with the bias.
20295 SDValue Or = DAG.getNode(
20296 ISD::OR, dl, MVT::v2i64,
20297 DAG.getBitcast(MVT::v2i64, Load),
20298 DAG.getBitcast(MVT::v2i64,
20299 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20300 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20301 DAG.getBitcast(MVT::v2f64, Or),
20302 DAG.getVectorIdxConstant(0, dl));
20303
20304 if (Op.getNode()->isStrictFPOpcode()) {
20305 // Subtract the bias.
20306 // TODO: Are there any fast-math-flags to propagate here?
20307 SDValue Chain = Op.getOperand(0);
20308 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20309 {Chain, Or, Bias});
20310
20311 if (Op.getValueType() == Sub.getValueType())
20312 return Sub;
20313
20314 // Handle final rounding.
20315 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20316 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20317
20318 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20319 }
20320
20321 // Subtract the bias.
20322 // TODO: Are there any fast-math-flags to propagate here?
20323 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20324
20325 // Handle final rounding.
20326 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20327}
20328
20329static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20330 SelectionDAG &DAG,
20331 const X86Subtarget &Subtarget) {
20332 if (Op.getSimpleValueType() != MVT::v2f64)
20333 return SDValue();
20334
20335 bool IsStrict = Op->isStrictFPOpcode();
20336
20337 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20338 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20339
20340 if (Subtarget.hasAVX512()) {
20341 if (!Subtarget.hasVLX()) {
20342 // Let generic type legalization widen this.
20343 if (!IsStrict)
20344 return SDValue();
20345 // Otherwise pad the integer input with 0s and widen the operation.
20346 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20347 DAG.getConstant(0, DL, MVT::v2i32));
20348 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20349 {Op.getOperand(0), N0});
20350 SDValue Chain = Res.getValue(1);
20351 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20352 DAG.getVectorIdxConstant(0, DL));
20353 return DAG.getMergeValues({Res, Chain}, DL);
20354 }
20355
20356 // Legalize to v4i32 type.
20357 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20358 DAG.getUNDEF(MVT::v2i32));
20359 if (IsStrict)
20360 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20361 {Op.getOperand(0), N0});
20362 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20363 }
20364
20365 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20366 // This gives us the floating point equivalent of 2^52 + the i32 integer
20367 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20368 // point leaving just our i32 integers in double format.
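  // Worked example for one lane with value 7: the zero-extended i64 is
  // 0x0000000000000007; OR'ing in 0x4330000000000000 yields the double
  // 2^52 + 7 exactly (the 32-bit payload sits entirely inside the 52-bit
  // mantissa), and the final subtraction of 2^52 leaves 7.0.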
20369 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20370 SDValue VBias = DAG.getConstantFP(
20371 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20372 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20373 DAG.getBitcast(MVT::v2i64, VBias));
20374 Or = DAG.getBitcast(MVT::v2f64, Or);
20375
20376 if (IsStrict)
20377 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20378 {Op.getOperand(0), Or, VBias});
20379 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20380}
20381
20382static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20383 SelectionDAG &DAG,
20384 const X86Subtarget &Subtarget) {
20385 bool IsStrict = Op->isStrictFPOpcode();
20386 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20387 MVT VecIntVT = V.getSimpleValueType();
20388 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20389 "Unsupported custom type");
20390
20391 if (Subtarget.hasAVX512()) {
20392 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20393 assert(!Subtarget.hasVLX() && "Unexpected features");
20394 MVT VT = Op->getSimpleValueType(0);
20395
20396 // v8i32->v8f64 is legal with AVX512 so just return it.
20397 if (VT == MVT::v8f64)
20398 return Op;
20399
20400 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20401 VT == MVT::v8f16) &&
20402 "Unexpected VT!");
20403 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20404 MVT WideIntVT = MVT::v16i32;
20405 if (VT == MVT::v4f64) {
20406 WideVT = MVT::v8f64;
20407 WideIntVT = MVT::v8i32;
20408 }
20409
20410 // Need to concat with zero vector for strict fp to avoid spurious
20411 // exceptions.
20412 SDValue Tmp =
20413 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20414 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20415 DAG.getVectorIdxConstant(0, DL));
20416 SDValue Res, Chain;
20417 if (IsStrict) {
20418 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20419 {Op->getOperand(0), V});
20420 Chain = Res.getValue(1);
20421 } else {
20422 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20423 }
20424
20425 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20426 DAG.getVectorIdxConstant(0, DL));
20427
20428 if (IsStrict)
20429 return DAG.getMergeValues({Res, Chain}, DL);
20430 return Res;
20431 }
20432
20433 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20434 Op->getSimpleValueType(0) == MVT::v4f64) {
20435 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20436 Constant *Bias = ConstantFP::get(
20437 *DAG.getContext(),
20438 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20439 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20440 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20441 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20442 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20443 SDValue VBias = DAG.getMemIntrinsicNode(
20444 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20445 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20446 MachineMemOperand::MOLoad);
20447
20448 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20449 DAG.getBitcast(MVT::v4i64, VBias));
20450 Or = DAG.getBitcast(MVT::v4f64, Or);
20451
20452 if (IsStrict)
20453 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20454 {Op.getOperand(0), Or, VBias});
20455 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20456 }
20457
20458 // The algorithm is the following:
20459 // #ifdef __SSE4_1__
20460 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20461 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20462 // (uint4) 0x53000000, 0xaa);
20463 // #else
20464 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20465 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20466 // #endif
20467 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20468 // return (float4) lo + fhi;
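  // Why this adds up: in IEEE-754 single precision 0x4b000000 is 2^23 and
  // 0x53000000 is 2^39, so reinterpreting the merged bit patterns gives
  //   lo = 2^23 + (v & 0xffff)        (exact)
  //   hi = 2^39 + (v >> 16) * 2^16    (exact)
  // The constant below, 0x53000080, is 2^39 + 2^23, so
  //   fhi = hi - (2^39 + 2^23) = (v >> 16) * 2^16 - 2^23
  // and lo + fhi = (v & 0xffff) + (v >> 16) * 2^16 = v, with only the final
  // addition subject to rounding.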
20469
20470 bool Is128 = VecIntVT == MVT::v4i32;
20471 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20472 // If we convert to something other than the supported type, e.g., to v4f64,
20473 // abort early.
20474 if (VecFloatVT != Op->getSimpleValueType(0))
20475 return SDValue();
20476
20477 // In the #ifdef/#else code, we have in common:
20478 // - The vector of constants:
20479 // -- 0x4b000000
20480 // -- 0x53000000
20481 // - A shift:
20482 // -- v >> 16
20483
20484 // Create the splat vector for 0x4b000000.
20485 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20486 // Create the splat vector for 0x53000000.
20487 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20488
20489 // Create the right shift.
20490 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20491 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20492
20493 SDValue Low, High;
20494 if (Subtarget.hasSSE41()) {
20495 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20496 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20497 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20498 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20499 // Low will be bitcasted right away, so do not bother bitcasting back to its
20500 // original type.
20501 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20502 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20503 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20504 // (uint4) 0x53000000, 0xaa);
20505 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20506 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20507 // High will be bitcasted right away, so do not bother bitcasting back to
20508 // its original type.
20509 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20510 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20511 } else {
20512 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20513 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20514 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20515 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20516
20517 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20518 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20519 }
20520
20521 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20522 SDValue VecCstFSub = DAG.getConstantFP(
20523 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20524
20525 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20526 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20527 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20528 // enabled. See PR24512.
20529 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20530 // TODO: Are there any fast-math-flags to propagate here?
20531 // (float4) lo;
20532 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20533 // return (float4) lo + fhi;
20534 if (IsStrict) {
20535 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20536 {Op.getOperand(0), HighBitcast, VecCstFSub});
20537 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20538 {FHigh.getValue(1), LowBitcast, FHigh});
20539 }
20540
20541 SDValue FHigh =
20542 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20543 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20544}
20545
20546static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20547 const X86Subtarget &Subtarget) {
20548 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20549 SDValue N0 = Op.getOperand(OpNo);
20550 MVT SrcVT = N0.getSimpleValueType();
20551
20552 switch (SrcVT.SimpleTy) {
20553 default:
20554 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20555 case MVT::v2i32:
20556 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20557 case MVT::v4i32:
20558 case MVT::v8i32:
20559 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20560 case MVT::v2i64:
20561 case MVT::v4i64:
20562 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20563 }
20564}
20565
20566SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20567 SelectionDAG &DAG) const {
20568 bool IsStrict = Op->isStrictFPOpcode();
20569 unsigned OpNo = IsStrict ? 1 : 0;
20570 SDValue Src = Op.getOperand(OpNo);
20571 SDLoc dl(Op);
20572 auto PtrVT = getPointerTy(DAG.getDataLayout());
20573 MVT SrcVT = Src.getSimpleValueType();
20574 MVT DstVT = Op->getSimpleValueType(0);
20575 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20576
20577 // Bail out when we don't have native conversion instructions.
20578 if (DstVT == MVT::f128)
20579 return SDValue();
20580
20581 if (isSoftF16(DstVT, Subtarget))
20582 return promoteXINT_TO_FP(Op, dl, DAG);
20583 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20584 return Op;
20585
20586 if (DstVT.isVector())
20587 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20588
20589 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20590 return LowerWin64_INT128_TO_FP(Op, DAG);
20591
20592 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20593 return Extract;
20594
20595 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20596 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20597 // Conversions from unsigned i32 to f32/f64 are legal,
20598 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20599 return Op;
20600 }
20601
20602 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20603 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20604 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20605 if (IsStrict)
20606 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20607 {Chain, Src});
20608 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20609 }
20610
20611 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20612 return V;
20613 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20614 return V;
20615
20616 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20617 // infinity. It produces -0.0, so disable under strictfp.
20618 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20619 !IsStrict)
20620 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20621 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20622 // negative infinity, so disable it under strictfp and use FILD instead.
20623 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20624 !IsStrict)
20625 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20626 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20627 (DstVT == MVT::f32 || DstVT == MVT::f64))
20628 return SDValue();
20629
20630 // Make a 64-bit buffer, and use it to build an FILD.
20631 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20632 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20633 Align SlotAlign(8);
20634 MachinePointerInfo MPI =
20635 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20636 if (SrcVT == MVT::i32) {
20637 SDValue OffsetSlot =
20638 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20639 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20640 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20641 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20642 std::pair<SDValue, SDValue> Tmp =
20643 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20644 if (IsStrict)
20645 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20646
20647 return Tmp.first;
20648 }
20649
20650 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20651 SDValue ValueToStore = Src;
20652 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20653 // Bitcasting to f64 here allows us to do a single 64-bit store from
20654 // an SSE register, avoiding the store forwarding penalty that would come
20655 // with two 32-bit stores.
20656 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20657 }
20658 SDValue Store =
20659 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20660 // For i64 source, we need to add the appropriate power of 2 if the input
20661 // was negative. We must be careful to do the computation in x87 extended
20662 // precision, not in SSE.
20663 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20664 SDValue Ops[] = {Store, StackSlot};
20665 SDValue Fild =
20666 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20667 SlotAlign, MachineMemOperand::MOLoad);
20668 Chain = Fild.getValue(1);
20669
20670 // Check whether the sign bit is set.
20671 SDValue SignSet = DAG.getSetCC(
20672 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20673 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20674
20675 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
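  // The high half, 0x5F800000, is the single-precision encoding of 2^64, and
  // the constant is stored little-endian, so the pool entry reads back as
  // float 0.0 at offset 0 and 2^64 at offset 4. The select below picks
  // offset 4 exactly when the sign bit was set, so the later add of Fudge
  // restores the 2^64 that FILD's signed interpretation of the i64 dropped.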
20676 APInt FF(64, 0x5F80000000000000ULL);
20677 SDValue FudgePtr =
20678 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20679 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20680
20681 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20682 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20683 SDValue Four = DAG.getIntPtrConstant(4, dl);
20684 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20685 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20686
20687 // Load the value out, extending it from f32 to f80.
20688 SDValue Fudge = DAG.getExtLoad(
20689 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20690 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20691 CPAlignment);
20692 Chain = Fudge.getValue(1);
20693 // Extend everything to 80 bits to force it to be done on x87.
20694 // TODO: Are there any fast-math-flags to propagate here?
20695 if (IsStrict) {
20696 unsigned Opc = ISD::STRICT_FADD;
20697 // Windows needs the precision control changed to 80bits around this add.
20698 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20699 Opc = X86ISD::STRICT_FP80_ADD;
20700
20701 SDValue Add =
20702 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20703 // STRICT_FP_ROUND can't handle equal types.
20704 if (DstVT == MVT::f80)
20705 return Add;
20706 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20707 {Add.getValue(1), Add,
20708 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20709 }
20710 unsigned Opc = ISD::FADD;
20711 // Windows needs the precision control changed to 80bits around this add.
20712 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20713 Opc = X86ISD::FP80_ADD;
20714
20715 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20716 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20717 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20718}
20719
20720// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20721// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20722// just return an SDValue().
20723// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20724// to i16, i32 or i64, and we lower it to a legal sequence and return the
20725// result.
20726SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20727 bool IsSigned,
20728 SDValue &Chain) const {
20729 bool IsStrict = Op->isStrictFPOpcode();
20730 SDLoc DL(Op);
20731
20732 EVT DstTy = Op.getValueType();
20733 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20734 EVT TheVT = Value.getValueType();
20735 auto PtrVT = getPointerTy(DAG.getDataLayout());
20736
20737 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20738 // f16 must be promoted before using the lowering in this routine.
20739 // fp128 does not use this lowering.
20740 return SDValue();
20741 }
20742
20743 // If using FIST to compute an unsigned i64, we'll need some fixup
20744 // to handle values above the maximum signed i64. A FIST is always
20745 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20746 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20747
20748 // FIXME: This does not generate an invalid exception if the input does not
20749 // fit in i32. PR44019
20750 if (!IsSigned && DstTy != MVT::i64) {
20751 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20752 // The low 32 bits of the fist result will have the correct uint32 result.
20753 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20754 DstTy = MVT::i64;
20755 }
20756
20757 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20758 DstTy.getSimpleVT() >= MVT::i16 &&
20759 "Unknown FP_TO_INT to lower!");
20760
20761 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20762 // stack slot.
20763 MachineFunction &MF = DAG.getMachineFunction();
20764 unsigned MemSize = DstTy.getStoreSize();
20765 int SSFI =
20766 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20767 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20768
20769 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20770
20771 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20772
20773 if (UnsignedFixup) {
20774 //
20775 // Conversion to unsigned i64 is implemented with a select,
20776 // depending on whether the source value fits in the range
20777 // of a signed i64. Let Thresh be the FP equivalent of
20778 // 0x8000000000000000ULL.
20779 //
20780 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20781 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20782 // FistSrc = (Value - FltOfs);
20783 // Fist-to-mem64 FistSrc
20784 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20785 // to XOR'ing the high 32 bits with Adjust.
20786 //
20787 // Being a power of 2, Thresh is exactly representable in all FP formats.
20788 // For X87 we'd like to use the smallest FP type for this constant, but
20789 // for DAG type consistency we have to match the FP operand type.
20790
20791 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20792 APFloat::opStatus Status = APFloat::opOK;
20793 bool LosesInfo = false;
20794 if (TheVT == MVT::f64)
20795 // The rounding mode is irrelevant as the conversion should be exact.
20796 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20797 &LosesInfo);
20798 else if (TheVT == MVT::f80)
20799 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20800 APFloat::rmNearestTiesToEven, &LosesInfo);
20801
20802 assert(Status == APFloat::opOK && !LosesInfo &&
20803 "FP conversion should have been exact");
20804
20805 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20806
20807 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20808 *DAG.getContext(), TheVT);
20809 SDValue Cmp;
20810 if (IsStrict) {
20811 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20812 /*IsSignaling*/ true);
20813 Chain = Cmp.getValue(1);
20814 } else {
20815 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20816 }
20817
20818 // Our preferred lowering of
20819 //
20820 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20821 //
20822 // is
20823 //
20824 // (Value >= Thresh) << 63
20825 //
20826 // but since we can get here after LegalOperations, DAGCombine might do the
20827 // wrong thing if we create a select. So, directly create the preferred
20828 // version.
20829 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20830 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20831 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20832
20833 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20834 DAG.getConstantFP(0.0, DL, TheVT));
20835
20836 if (IsStrict) {
20837 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20838 { Chain, Value, FltOfs });
20839 Chain = Value.getValue(1);
20840 } else
20841 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20842 }
20843
20844 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20845
20846 // FIXME This causes a redundant load/store if the SSE-class value is already
20847 // in memory, such as if it is on the callstack.
20848 if (isScalarFPTypeInSSEReg(TheVT)) {
20849 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20850 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20851 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20852 SDValue Ops[] = { Chain, StackSlot };
20853
20854 unsigned FLDSize = TheVT.getStoreSize();
20855 assert(FLDSize <= MemSize && "Stack slot not big enough");
20856 MachineMemOperand *MMO = MF.getMachineMemOperand(
20857 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20858 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20859 Chain = Value.getValue(1);
20860 }
20861
20862 // Build the FP_TO_INT*_IN_MEM
20863 MachineMemOperand *MMO = MF.getMachineMemOperand(
20864 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20865 SDValue Ops[] = { Chain, Value, StackSlot };
20866 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20867 DAG.getVTList(MVT::Other),
20868 Ops, DstTy, MMO);
20869
20870 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20871 Chain = Res.getValue(1);
20872
20873 // If we need an unsigned fixup, XOR the result with adjust.
20874 if (UnsignedFixup)
20875 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20876
20877 return Res;
20878}
20879
20880static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20881 const X86Subtarget &Subtarget) {
20882 MVT VT = Op.getSimpleValueType();
20883 SDValue In = Op.getOperand(0);
20884 MVT InVT = In.getSimpleValueType();
20885 unsigned Opc = Op.getOpcode();
20886
20887 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20888 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20889 "Unexpected extension opcode");
20890 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20891 "Expected same number of elements");
20892 assert((VT.getVectorElementType() == MVT::i16 ||
20893 VT.getVectorElementType() == MVT::i32 ||
20894 VT.getVectorElementType() == MVT::i64) &&
20895 "Unexpected element type");
20896 assert((InVT.getVectorElementType() == MVT::i8 ||
20897 InVT.getVectorElementType() == MVT::i16 ||
20898 InVT.getVectorElementType() == MVT::i32) &&
20899 "Unexpected element type");
20900
20901 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20902
20903 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20904 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20905 return splitVectorIntUnary(Op, DAG, dl);
20906 }
20907
20908 if (Subtarget.hasInt256())
20909 return Op;
20910
20911 // Optimize vectors in AVX mode:
20912 //
20913 // v8i16 -> v8i32
20914 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20915 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20916 // Concat upper and lower parts.
20917 //
20918 // v4i32 -> v4i64
20919 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20920 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20921 // Concat upper and lower parts.
20922 //
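  // For example, zero-extending <8 x i16> {a,b,c,d,e,f,g,h} to <8 x i32>: the
  // EXTEND_VECTOR_INREG node below widens the low half to the <4 x i32>
  // {a,b,c,d}, while unpacking the high half against a zero vector pairs each
  // of {e,f,g,h} with a zero i16, which is exactly its i32 zero-extension;
  // the two halves are then concatenated.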
20923 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20924 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20925
20926 // Short-circuit if we can determine that each 128-bit half is the same value.
20927 // Otherwise, this is difficult to match and optimize.
20928 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20929 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20930 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20931
20932 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20933 SDValue Undef = DAG.getUNDEF(InVT);
20934 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20935 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20936 OpHi = DAG.getBitcast(HalfVT, OpHi);
20937
20938 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20939}
20940
20941// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20942static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20943 const SDLoc &dl, SelectionDAG &DAG) {
20944 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20945 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20946 DAG.getVectorIdxConstant(0, dl));
20947 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20948 DAG.getVectorIdxConstant(8, dl));
20949 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20950 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20951 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20952 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20953}
20954
20955static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20956 const X86Subtarget &Subtarget,
20957 SelectionDAG &DAG) {
20958 MVT VT = Op->getSimpleValueType(0);
20959 SDValue In = Op->getOperand(0);
20960 MVT InVT = In.getSimpleValueType();
20961 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20962 unsigned NumElts = VT.getVectorNumElements();
20963
20964 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20965 // avoids a constant pool load.
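  // For example, for a vXi32 result: a set mask bit sign-extends to
  // 0xFFFFFFFF and the logical shift right by 31 reduces it to 1, while a
  // clear bit stays 0; this matches zero_extend without materializing a
  // vector of ones from the constant pool.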
20966 if (VT.getVectorElementType() != MVT::i8) {
20967 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20968 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20969 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20970 }
20971
20972 // Extend VT if BWI is not supported.
20973 MVT ExtVT = VT;
20974 if (!Subtarget.hasBWI()) {
20975 // If v16i32 is to be avoided, we'll need to split and concatenate.
20976 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20977 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20978
20979 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20980 }
20981
20982 // Widen to 512-bits if VLX is not supported.
20983 MVT WideVT = ExtVT;
20984 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20985 NumElts *= 512 / ExtVT.getSizeInBits();
20986 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20987 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20988 DAG.getVectorIdxConstant(0, DL));
20989 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20990 }
20991
20992 SDValue One = DAG.getConstant(1, DL, WideVT);
20993 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20994
20995 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20996
20997 // Truncate if we had to extend above.
20998 if (VT != ExtVT) {
20999 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21000 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21001 }
21002
21003 // Extract back to 128/256-bit if we widened.
21004 if (WideVT != VT)
21005 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21006 DAG.getVectorIdxConstant(0, DL));
21007
21008 return SelectedVal;
21009}
21010
21011static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21012 SelectionDAG &DAG) {
21013 SDValue In = Op.getOperand(0);
21014 MVT SVT = In.getSimpleValueType();
21015 SDLoc DL(Op);
21016
21017 if (SVT.getVectorElementType() == MVT::i1)
21018 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21019
21020 assert(Subtarget.hasAVX() && "Expected AVX support");
21021 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21022}
21023
21024/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21025/// It makes use of the fact that vectors with enough leading sign/zero bits
21026/// prevent the PACKSS/PACKUS from saturating the results.
21027/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21028/// within each 128-bit lane.
21029static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21030 const SDLoc &DL, SelectionDAG &DAG,
21031 const X86Subtarget &Subtarget) {
21032 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21033 "Unexpected PACK opcode");
21034 assert(DstVT.isVector() && "VT not a vector?");
21035
21036 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21037 if (!Subtarget.hasSSE2())
21038 return SDValue();
21039
21040 EVT SrcVT = In.getValueType();
21041
21042 // No truncation required; we might get here due to recursive calls.
21043 if (SrcVT == DstVT)
21044 return In;
21045
21046 unsigned NumElems = SrcVT.getVectorNumElements();
21047 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21048 return SDValue();
21049
21050 unsigned DstSizeInBits = DstVT.getSizeInBits();
21051 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21052 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21053 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21054
21055 LLVMContext &Ctx = *DAG.getContext();
21056 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21057 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21058
21059 // Pack to the largest type possible:
21060 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21061 EVT InVT = MVT::i16, OutVT = MVT::i8;
21062 if (SrcVT.getScalarSizeInBits() > 16 &&
21063 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21064 InVT = MVT::i32;
21065 OutVT = MVT::i16;
21066 }
21067
21068 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21069 // On pre-AVX512, pack the src in both halves to help value tracking.
21070 if (SrcSizeInBits <= 128) {
21071 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21072 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21073 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21074 SDValue LHS = DAG.getBitcast(InVT, In);
21075 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21076 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21077 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21078 Res = DAG.getBitcast(PackedVT, Res);
21079 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21080 }
21081
21082 // Split lower/upper subvectors.
21083 SDValue Lo, Hi;
21084 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21085
21086 // If Hi is undef, then don't bother packing it and widen the result instead.
21087 if (Hi.isUndef()) {
21088 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21089 if (SDValue Res =
21090 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21091 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21092 }
21093
21094 unsigned SubSizeInBits = SrcSizeInBits / 2;
21095 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21096 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21097
21098 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21099 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21100 Lo = DAG.getBitcast(InVT, Lo);
21101 Hi = DAG.getBitcast(InVT, Hi);
21102 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21103 return DAG.getBitcast(DstVT, Res);
21104 }
21105
21106 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21107 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21108 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21109 Lo = DAG.getBitcast(InVT, Lo);
21110 Hi = DAG.getBitcast(InVT, Hi);
21111 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21112
21113 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21114 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21115 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21116 SmallVector<int, 64> Mask;
21117 int Scale = 64 / OutVT.getScalarSizeInBits();
21118 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21119 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21120
21121 if (DstVT.is256BitVector())
21122 return DAG.getBitcast(DstVT, Res);
21123
21124 // If 512bit -> 128bit truncate another stage.
21125 Res = DAG.getBitcast(PackedVT, Res);
21126 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21127 }
21128
21129 // Recursively pack lower/upper subvectors, concat result and pack again.
21130 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21131
21132 if (PackedVT.is128BitVector()) {
21133 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21134 // type legalization.
21135 SDValue Res =
21136 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21137 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21138 }
21139
21140 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21141 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21142 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21143 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21144 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21145}
21146
21147/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21148/// e.g. trunc <8 x i32> X to <8 x i16> -->
21149/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21150/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21151static SDValue truncateVectorWithPACKUS(MVT DstVT, SDValue In, const SDLoc &DL,
21152 const X86Subtarget &Subtarget,
21153 SelectionDAG &DAG) {
21154 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21155 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21156}
21157
21158/// Truncate using inreg sign extension and X86ISD::PACKSS.
21159static SDValue truncateVectorWithPACKSS(MVT DstVT, SDValue In, const SDLoc &DL,
21160 const X86Subtarget &Subtarget,
21161 SelectionDAG &DAG) {
21162 EVT SrcVT = In.getValueType();
21163 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21164 DAG.getValueType(DstVT));
21165 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21166}
21167
21168/// Helper to determine if \p In truncated to \p DstVT has the necessary
21169/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21170/// possibly by converting a SRL node to SRA for sign extension.
21171static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21172 SDValue In, const SDLoc &DL,
21173 SelectionDAG &DAG,
21174 const X86Subtarget &Subtarget,
21175 const SDNodeFlags Flags = SDNodeFlags()) {
21176 // Requires SSE2.
21177 if (!Subtarget.hasSSE2())
21178 return SDValue();
21179
21180 EVT SrcVT = In.getValueType();
21181 EVT DstSVT = DstVT.getVectorElementType();
21182 EVT SrcSVT = SrcVT.getVectorElementType();
21183 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21184 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21185
21186 // Check we have a truncation suited for PACKSS/PACKUS.
21187 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21188 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21189 return SDValue();
21190
21191 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21192 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21193
21194 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21195 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21196 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21197 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21198 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21199 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21200 return SDValue();
21201
21202 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21203 // split this for packing.
21204 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21205 !isFreeToSplitVector(In, DAG) &&
21206 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21207 return SDValue();
21208
21209 // Don't lower the truncation on AVX512 targets as multiple PACK node stages.
21210 if (Subtarget.hasAVX512() && NumStages > 1)
21211 return SDValue();
21212
21213 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21214 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21215
21216 // Truncate with PACKUS if we are truncating a vector with leading zero
21217 // bits that extend all the way to the packed/truncated value.
21218 // e.g. Masks, zext_in_reg, etc.
21219 // Pre-SSE41 we can only use PACKUSWB.
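  // For example, for a v8i32 source known to have at least 16 leading zero
  // bits per element, every value already fits in 16 bits, so PACKUSDW's
  // unsigned saturation to [0, 65535] never triggers and the pack is an exact
  // v8i32 -> v8i16 truncation.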
21220 KnownBits Known = DAG.computeKnownBits(In);
21221 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21222 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21223 PackOpcode = X86ISD::PACKUS;
21224 return In;
21225 }
21226
21227 // Truncate with PACKSS if we are truncating a vector with sign-bits
21228 // that extend all the way to the packed/truncated value.
21229 // e.g. Comparison result, sext_in_reg, etc.
21230 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21231
21232 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21233 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21234 // see through BITCASTs later on and combines/simplifications can't then use
21235 // it.
21236 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21237 !Subtarget.hasAVX512())
21238 return SDValue();
21239
21240 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21241 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21242 MinSignBits < NumSignBits) {
21243 PackOpcode = X86ISD::PACKSS;
21244 return In;
21245 }
21246
21247 // If we have a srl that only generates signbits that we will discard in
21248 // the truncation then we can use PACKSS by converting the srl to a sra.
21249 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21250 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21251 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21252 if (*ShAmt == MinSignBits) {
21253 PackOpcode = X86ISD::PACKSS;
21254 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21255 }
21256 }
21257
21258 return SDValue();
21259}
21260
21261/// This function lowers a vector truncation of 'extended sign-bits' or
21262/// 'extended zero-bits' values.
21263/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
21264static SDValue LowerTruncateVecPackWithSignBits(
21265 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21266 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21267 MVT SrcVT = In.getSimpleValueType();
21268 MVT DstSVT = DstVT.getVectorElementType();
21269 MVT SrcSVT = SrcVT.getVectorElementType();
21270 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21271 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21272 return SDValue();
21273
21274 // If the upper half of the source is undef, then attempt to split and
21275 // only truncate the lower half.
21276 if (DstVT.getSizeInBits() >= 128) {
21277 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21278 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21279 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21280 Subtarget, DAG))
21281 return widenSubVector(Res, false, Subtarget, DAG, DL,
21282 DstVT.getSizeInBits());
21283 }
21284 }
21285
21286 unsigned PackOpcode;
21287 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21288 Subtarget, Flags))
21289 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21290
21291 return SDValue();
21292}
21293
21294/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21295/// X86ISD::PACKUS/X86ISD::PACKSS operations.
21296static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21297 const X86Subtarget &Subtarget,
21298 SelectionDAG &DAG) {
21299 MVT SrcVT = In.getSimpleValueType();
21300 MVT DstSVT = DstVT.getVectorElementType();
21301 MVT SrcSVT = SrcVT.getVectorElementType();
21302 unsigned NumElems = DstVT.getVectorNumElements();
21303 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21304 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21305 NumElems >= 8))
21306 return SDValue();
21307
21308 // SSSE3's pshufb results in fewer instructions in the cases below.
21309 if (Subtarget.hasSSSE3() && NumElems == 8) {
21310 if (SrcSVT == MVT::i16)
21311 return SDValue();
21312 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21313 return SDValue();
21314 }
21315
21316 // If the upper half of the source is undef, then attempt to split and
21317 // only truncate the lower half.
21318 if (DstVT.getSizeInBits() >= 128) {
21319 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21320 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21321 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21322 return widenSubVector(Res, false, Subtarget, DAG, DL,
21323 DstVT.getSizeInBits());
21324 }
21325 }
21326
21327 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21328 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21329 // truncate 2 x v4i32 to v8i16.
21330 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21331 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21332
21333 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21334 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21335
21336 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21337 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21338 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21339 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21340 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21341 }
21342
21343 return SDValue();
21344}
21345
21346static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21347 SelectionDAG &DAG,
21348 const X86Subtarget &Subtarget) {
21349 MVT VT = Op.getSimpleValueType();
21350 SDValue In = Op.getOperand(0);
21351 MVT InVT = In.getSimpleValueType();
21352 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21353
21354 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21355 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
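// Shifting left by (EltBits - 1) moves bit 0 into the sign bit, where
// VPMOVB2M/VPMOVW2M (or a signed compare against zero) can read it.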
21356 if (InVT.getScalarSizeInBits() <= 16) {
21357 if (Subtarget.hasBWI()) {
21358 // legal, will go to VPMOVB2M, VPMOVW2M
21359 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21360 // We need to shift to get the lsb into sign position.
21361 // Shift packed bytes not supported natively, bitcast to word
21362 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21363 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21364 DAG.getBitcast(ExtVT, In),
21365 DAG.getConstant(ShiftInx, DL, ExtVT));
21366 In = DAG.getBitcast(InVT, In);
21367 }
21368 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21369 In, ISD::SETGT);
21370 }
21371 // Use TESTD/Q by extending the vector to packed dword/qword.
21372 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21373 "Unexpected vector type.");
21374 unsigned NumElts = InVT.getVectorNumElements();
21375 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21376 // We need to change to a wider element type that we have support for.
21377 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21378 // For 16 element vectors we extend to v16i32 unless we are explicitly
21379 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21380 // we need to split into two 8 element vectors which we can extend to v8i32,
21381 // truncate and concat the results. There's an additional complication if
21382 // the original type is v16i8. In that case we can't split the v16i8
21383 // directly, so we need to shuffle high elements to low and use
21384 // sign_extend_vector_inreg.
21385 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21386 SDValue Lo, Hi;
21387 if (InVT == MVT::v16i8) {
21388 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21389 Hi = DAG.getVectorShuffle(
21390 InVT, DL, In, In,
21391 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21392 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21393 } else {
21394 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21395 Lo = extract128BitVector(In, 0, DAG, DL);
21396 Hi = extract128BitVector(In, 8, DAG, DL);
21397 }
21398 // We're split now, just emit two truncates and a concat. The two
21399 // truncates will trigger legalization to come back to this function.
21400 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21401 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21402 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21403 }
21404 // We either have 8 elements or we're allowed to use 512-bit vectors.
21405 // If we have VLX, we want to use the narrowest vector that can get the
21406 // job done so we use vXi32.
21407 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21408 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21409 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21410 InVT = ExtVT;
21411 ShiftInx = InVT.getScalarSizeInBits() - 1;
21412 }
21413
21414 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21415 // We need to shift to get the lsb into sign position.
21416 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21417 DAG.getConstant(ShiftInx, DL, InVT));
21418 }
21419 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21420 if (Subtarget.hasDQI())
21421 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21422 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21423}
21424
21425SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21426 SDLoc DL(Op);
21427 MVT VT = Op.getSimpleValueType();
21428 SDValue In = Op.getOperand(0);
21429 MVT InVT = In.getSimpleValueType();
21430 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21431 "Invalid TRUNCATE operation");
21432
21433 // If we're called by the type legalizer, handle a few cases.
21434 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21435 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21436 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21437 VT.is128BitVector() && Subtarget.hasAVX512()) {
21438 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21439 "Unexpected subtarget!");
21440 // The default behavior is to truncate one step, concatenate, and then
21441 // truncate the remainder. We'd rather produce two 64-bit results and
21442 // concatenate those.
21443 SDValue Lo, Hi;
21444 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21445
21446 EVT LoVT, HiVT;
21447 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21448
21449 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21450 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21451 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21452 }
21453
21454 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21455 if (!Subtarget.hasAVX512() ||
21456 (InVT.is512BitVector() && VT.is256BitVector()))
21457 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21458 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21459 return SignPack;
21460
21461 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21462 if (!Subtarget.hasAVX512())
21463 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21464
21465 // Otherwise let default legalization handle it.
21466 return SDValue();
21467 }
21468
21469 if (VT.getVectorElementType() == MVT::i1)
21470 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21471
21472 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21473 // concat from subvectors to use VPTRUNC etc.
21474 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21475 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21476 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21477 return SignPack;
21478
21479 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21480 if (Subtarget.hasAVX512()) {
21481 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21482 assert(VT == MVT::v32i8 && "Unexpected VT!");
21483 return splitVectorIntUnary(Op, DAG, DL);
21484 }
21485
21486 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21487 // and then truncate that. But we should only do that if we haven't been
21488 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21489 // handled by isel patterns.
21490 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21491 Subtarget.canExtendTo512DQ())
21492 return Op;
21493 }
21494
21495 // Handle truncation of V256 to V128 using shuffles.
21496 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21497
21498 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21499 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21500 if (Subtarget.hasInt256()) {
21501 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21502 In = DAG.getBitcast(MVT::v8i32, In);
21503 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21504 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21505 DAG.getVectorIdxConstant(0, DL));
21506 }
21507
21508 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21509 DAG.getVectorIdxConstant(0, DL));
21510 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21511 DAG.getVectorIdxConstant(2, DL));
21512 static const int ShufMask[] = {0, 2, 4, 6};
21513 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21514 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21515 }
21516
21517 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21518 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21519 if (Subtarget.hasInt256()) {
21520 // The PSHUFB mask:
21521 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21522 -1, -1, -1, -1, -1, -1, -1, -1,
21523 16, 17, 20, 21, 24, 25, 28, 29,
21524 -1, -1, -1, -1, -1, -1, -1, -1 };
21525 In = DAG.getBitcast(MVT::v32i8, In);
21526 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21527 In = DAG.getBitcast(MVT::v4i64, In);
21528
21529 static const int ShufMask2[] = {0, 2, -1, -1};
21530 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21531 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21532 DAG.getVectorIdxConstant(0, DL));
21533 return DAG.getBitcast(MVT::v8i16, In);
21534 }
21535
21536 return Subtarget.hasSSE41()
21537 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21538 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21539 }
21540
21541 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21542 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21543
21544 llvm_unreachable("All 256->128 cases should have been handled above!");
21545}
21546
21547// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21548// behaves on out of range inputs to generate optimized conversions.
21549static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21550 SelectionDAG &DAG,
21551 const X86Subtarget &Subtarget) {
21552 MVT SrcVT = Src.getSimpleValueType();
21553 unsigned DstBits = VT.getScalarSizeInBits();
21554 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21555
21556 // Calculate the converted result for values in the range 0 to
21557 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21558 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21559 SDValue Big =
21560 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21561 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21562 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21563
21564 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21565 // and only if the value was out of range. So we can use that
21566 // as our indicator that we rather use "Big" instead of "Small".
21567 //
21568 // Use "Small" if "IsOverflown" has all bits cleared
21569 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21570
21571 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21572 // use the slightly slower blendv select instead.
21573 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21574 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21575 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21576 }
21577
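// Arithmetic-shift "Small" right by (DstBits - 1) to splat its sign bit: the
// mask is all-ones exactly in the lanes where the signed conversion overflowed.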
21578 SDValue IsOverflown =
21579 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21580 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21581 return DAG.getNode(ISD::OR, dl, VT, Small,
21582 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21583}
21584
21585SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21586 bool IsStrict = Op->isStrictFPOpcode();
21587 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21588 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21589 bool HasVLX = Subtarget.hasVLX();
21590 MVT VT = Op->getSimpleValueType(0);
21591 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21592 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21593 MVT SrcVT = Src.getSimpleValueType();
21594 SDLoc dl(Op);
21595
21596 SDValue Res;
21597 if (isSoftF16(SrcVT, Subtarget)) {
21598 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21599 if (IsStrict)
21600 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21601 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21602 {NVT, MVT::Other}, {Chain, Src})});
21603 return DAG.getNode(Op.getOpcode(), dl, VT,
21604 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21605 } else if (isTypeLegal(SrcVT) &&
21606 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21607 return Op;
21608 }
21609
21610 if (VT.isVector()) {
21611 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21612 MVT ResVT = MVT::v4i32;
21613 MVT TruncVT = MVT::v4i1;
21614 unsigned Opc;
21615 if (IsStrict)
21616 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21617 else
21618 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21619
21620 if (!IsSigned && !HasVLX) {
21621 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21622 // Widen to 512-bits.
21623 ResVT = MVT::v8i32;
21624 TruncVT = MVT::v8i1;
21625 Opc = Op.getOpcode();
21626 // Need to concat with zero vector for strict fp to avoid spurious
21627 // exceptions.
21628 // TODO: Should we just do this for non-strict as well?
21629 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21630 : DAG.getUNDEF(MVT::v8f64);
21631 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21632 DAG.getVectorIdxConstant(0, dl));
21633 }
21634 if (IsStrict) {
21635 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21636 Chain = Res.getValue(1);
21637 } else {
21638 Res = DAG.getNode(Opc, dl, ResVT, Src);
21639 }
21640
21641 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21642 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21643 DAG.getVectorIdxConstant(0, dl));
21644 if (IsStrict)
21645 return DAG.getMergeValues({Res, Chain}, dl);
21646 return Res;
21647 }
21648
21649 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21650 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21651 VT == MVT::v32i16)
21652 return Op;
21653
21654 MVT ResVT = VT;
21655 MVT EleVT = VT.getVectorElementType();
21656 if (EleVT != MVT::i64)
21657 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21658
21659 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21660 SDValue Tmp =
21661 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21662 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21663 Ops[0] = Src;
21664 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21665 }
21666
21667 if (!HasVLX) {
21668 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21669 // Widen to 512-bits.
21670 unsigned IntSize = EleVT.getSizeInBits();
21671 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
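// Pick the widened element count so that either the integer result (i32/i64
// elements) or the f16 source (i8/i16 results) fills a 512-bit register.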
21672 ResVT = MVT::getVectorVT(EleVT, Num);
21673 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21674 Subtarget, DAG, dl);
21675 }
21676
21677 if (IsStrict) {
21678 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21679 : X86ISD::STRICT_CVTTP2UI,
21680 dl, {ResVT, MVT::Other}, {Chain, Src});
21681 Chain = Res.getValue(1);
21682 } else {
21683 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21684 ResVT, Src);
21685 }
21686
21687 // TODO: Need to add exception check code for strict FP.
21688 if (EleVT.getSizeInBits() < 16) {
21689 if (HasVLX)
21690 ResVT = MVT::getVectorVT(EleVT, 8);
21691 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21692 }
21693
21694 if (ResVT != VT)
21695 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21696 DAG.getVectorIdxConstant(0, dl));
21697
21698 if (IsStrict)
21699 return DAG.getMergeValues({Res, Chain}, dl);
21700 return Res;
21701 }
21702
21703 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21704 if (VT.getVectorElementType() == MVT::i16) {
21705 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21706 SrcVT.getVectorElementType() == MVT::f64) &&
21707 "Expected f32/f64 vector!");
21708 MVT NVT = VT.changeVectorElementType(MVT::i32);
21709 if (IsStrict) {
21710 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21711 : ISD::STRICT_FP_TO_UINT,
21712 dl, {NVT, MVT::Other}, {Chain, Src});
21713 Chain = Res.getValue(1);
21714 } else {
21715 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21716 NVT, Src);
21717 }
21718
21719 // TODO: Need to add exception check code for strict FP.
21720 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21721
21722 if (IsStrict)
21723 return DAG.getMergeValues({Res, Chain}, dl);
21724 return Res;
21725 }
21726
21727 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21728 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21729 assert(!IsSigned && "Expected unsigned conversion!");
21730 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21731 return Op;
21732 }
21733
21734 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21735 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21736 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21737 Subtarget.useAVX512Regs()) {
21738 assert(!IsSigned && "Expected unsigned conversion!");
21739 assert(!Subtarget.hasVLX() && "Unexpected features!");
21740 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21741 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21742 // Need to concat with zero vector for strict fp to avoid spurious
21743 // exceptions.
21744 // TODO: Should we just do this for non-strict as well?
21745 SDValue Tmp =
21746 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21747 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21748 DAG.getVectorIdxConstant(0, dl));
21749
21750 if (IsStrict) {
21751 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21752 {Chain, Src});
21753 Chain = Res.getValue(1);
21754 } else {
21755 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21756 }
21757
21758 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21759 DAG.getVectorIdxConstant(0, dl));
21760
21761 if (IsStrict)
21762 return DAG.getMergeValues({Res, Chain}, dl);
21763 return Res;
21764 }
21765
21766 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21767 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21768 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21769 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21770 assert(!Subtarget.hasVLX() && "Unexpected features!");
21771 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21772 // Need to concat with zero vector for strict fp to avoid spurious
21773 // exceptions.
21774 // TODO: Should we just do this for non-strict as well?
21775 SDValue Tmp =
21776 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21777 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21778 DAG.getVectorIdxConstant(0, dl));
21779
21780 if (IsStrict) {
21781 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21782 {Chain, Src});
21783 Chain = Res.getValue(1);
21784 } else {
21785 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21786 }
21787
21788 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21789 DAG.getVectorIdxConstant(0, dl));
21790
21791 if (IsStrict)
21792 return DAG.getMergeValues({Res, Chain}, dl);
21793 return Res;
21794 }
21795
21796 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21797 if (!Subtarget.hasVLX()) {
21798 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21799 // legalizer and then widened again by vector op legalization.
21800 if (!IsStrict)
21801 return SDValue();
21802
21803 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21804 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21805 {Src, Zero, Zero, Zero});
21806 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21807 {Chain, Tmp});
21808 SDValue Chain = Tmp.getValue(1);
21809 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21810 DAG.getVectorIdxConstant(0, dl));
21811 return DAG.getMergeValues({Tmp, Chain}, dl);
21812 }
21813
21814 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21815 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21816 DAG.getUNDEF(MVT::v2f32));
21817 if (IsStrict) {
21818 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21819 : X86ISD::STRICT_CVTTP2UI;
21820 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21821 }
21822 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21823 return DAG.getNode(Opc, dl, VT, Tmp);
21824 }
21825
21826 // Generate optimized instructions for pre AVX512 unsigned conversions from
21827 // vXf32 to vXi32.
21828 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21829 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21830 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21831 assert(!IsSigned && "Expected unsigned conversion!");
21832 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21833 }
21834
21835 return SDValue();
21836 }
21837
21838 assert(!VT.isVector());
21839
21840 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21841
21842 if (!IsSigned && UseSSEReg) {
21843 // Conversions from f32/f64 with AVX512 should be legal.
21844 if (Subtarget.hasAVX512())
21845 return Op;
21846
21847 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21848 // behaves on out of range inputs to generate optimized conversions.
21849 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21850 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21851 unsigned DstBits = VT.getScalarSizeInBits();
21852 APInt UIntLimit = APInt::getSignMask(DstBits);
21853 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21854 DAG.getConstant(UIntLimit, dl, VT));
21855 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
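// The scalar CVTTS2SI nodes read their source from the low element of a
// 128-bit vector, hence the SCALAR_TO_VECTOR wrapping below.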
21856
21857 // Calculate the converted result for values in the range:
21858 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21859 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21860 SDValue Small =
21861 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21862 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21863 SDValue Big = DAG.getNode(
21864 X86ISD::CVTTS2SI, dl, VT,
21865 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21866 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21867
21868 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21869 // and only if the value was out of range. So we can use that
21870 // as our indicator that we rather use "Big" instead of "Small".
21871 //
21872 // Use "Small" if "IsOverflown" has all bits cleared
21873 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21874 SDValue IsOverflown = DAG.getNode(
21875 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21876 return DAG.getNode(ISD::OR, dl, VT, Small,
21877 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21878 }
21879
21880 // Use default expansion for i64.
21881 if (VT == MVT::i64)
21882 return SDValue();
21883
21884 assert(VT == MVT::i32 && "Unexpected VT!");
21885
21886 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21887 // FIXME: This does not generate an invalid exception if the input does not
21888 // fit in i32. PR44019
21889 if (Subtarget.is64Bit()) {
21890 if (IsStrict) {
21891 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21892 {Chain, Src});
21893 Chain = Res.getValue(1);
21894 } else
21895 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21896
21897 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21898 if (IsStrict)
21899 return DAG.getMergeValues({Res, Chain}, dl);
21900 return Res;
21901 }
21902
21903 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21904 // use fisttp which will be handled later.
21905 if (!Subtarget.hasSSE3())
21906 return SDValue();
21907 }
21908
21909 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21910 // FIXME: This does not generate an invalid exception if the input does not
21911 // fit in i16. PR44019
21912 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21913 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21914 if (IsStrict) {
21915 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21916 {Chain, Src});
21917 Chain = Res.getValue(1);
21918 } else
21919 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21920
21921 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21922 if (IsStrict)
21923 return DAG.getMergeValues({Res, Chain}, dl);
21924 return Res;
21925 }
21926
21927 // If this is a FP_TO_SINT using SSEReg we're done.
21928 if (UseSSEReg && IsSigned)
21929 return Op;
21930
21931 // fp128 needs to use a libcall.
21932 if (SrcVT == MVT::f128) {
21933 RTLIB::Libcall LC;
21934 if (IsSigned)
21935 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21936 else
21937 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21938
21939 MakeLibCallOptions CallOptions;
21940 std::pair<SDValue, SDValue> Tmp =
21941 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21942
21943 if (IsStrict)
21944 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21945
21946 return Tmp.first;
21947 }
21948
21949 // Fall back to X87.
21950 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21951 if (IsStrict)
21952 return DAG.getMergeValues({V, Chain}, dl);
21953 return V;
21954 }
21955
21956 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21957}
21958
21959SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21960 SelectionDAG &DAG) const {
21961 SDValue Src = Op.getOperand(0);
21962 EVT DstVT = Op.getSimpleValueType();
21963 MVT SrcVT = Src.getSimpleValueType();
21964
21965 if (SrcVT.isVector())
21966 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21967
21968 if (SrcVT == MVT::f16)
21969 return SDValue();
21970
21971 // If the source is in an SSE register, the node is Legal.
21972 if (isScalarFPTypeInSSEReg(SrcVT))
21973 return Op;
21974
21975 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21976}
21977
21978SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21979 SelectionDAG &DAG) const {
21980 EVT DstVT = N->getValueType(0);
21981 SDValue Src = N->getOperand(0);
21982 EVT SrcVT = Src.getValueType();
21983
21984 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21985 // f16 must be promoted before using the lowering in this routine.
21986 // fp128 does not use this lowering.
21987 return SDValue();
21988 }
21989
21990 SDLoc DL(N);
21991 SDValue Chain = DAG.getEntryNode();
21992
21993 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21994
21995 // If we're converting from SSE, the stack slot needs to hold both types.
21996 // Otherwise it only needs to hold the DstVT.
21997 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21998 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21999 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22000 MachinePointerInfo MPI =
22001 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22002
22003 if (UseSSE) {
22004 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22005 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22006 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22007 SDValue Ops[] = { Chain, StackPtr };
22008
22009 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22010 /*Align*/ std::nullopt,
22011 MachineMemOperand::MOLoad);
22012 Chain = Src.getValue(1);
22013 }
22014
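// X86ISD::FIST stores the x87 value to the stack slot as a DstVT-sized integer
// using the current x87 rounding mode, which is the behavior lrint/llrint require.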
22015 SDValue StoreOps[] = { Chain, Src, StackPtr };
22016 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22017 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22018 MachineMemOperand::MOStore);
22019
22020 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22021}
22022
22023SDValue
22024X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22025 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22026 // but making use of X86 specifics to produce better instruction sequences.
22027 SDNode *Node = Op.getNode();
22028 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22029 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22030 SDLoc dl(SDValue(Node, 0));
22031 SDValue Src = Node->getOperand(0);
22032
22033 // There are three types involved here: SrcVT is the source floating point
22034 // type, DstVT is the type of the result, and TmpVT is the result of the
22035 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22036 // DstVT).
22037 EVT SrcVT = Src.getValueType();
22038 EVT DstVT = Node->getValueType(0);
22039 EVT TmpVT = DstVT;
22040
22041 // This code is only for floats and doubles. Fall back to generic code for
22042 // anything else.
22043 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22044 return SDValue();
22045
22046 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22047 unsigned SatWidth = SatVT.getScalarSizeInBits();
22048 unsigned DstWidth = DstVT.getScalarSizeInBits();
22049 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22050 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22051 "Expected saturation width smaller than result width");
22052
22053 // Promote result of FP_TO_*INT to at least 32 bits.
22054 if (TmpWidth < 32) {
22055 TmpVT = MVT::i32;
22056 TmpWidth = 32;
22057 }
22058
22059 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22060 // us to use a native signed conversion instead.
22061 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22062 TmpVT = MVT::i64;
22063 TmpWidth = 64;
22064 }
22065
22066 // If the saturation width is smaller than the size of the temporary result,
22067 // we can always use signed conversion, which is native.
22068 if (SatWidth < TmpWidth)
22069 FpToIntOpcode = ISD::FP_TO_SINT;
22070
22071 // Determine minimum and maximum integer values and their corresponding
22072 // floating-point values.
22073 APInt MinInt, MaxInt;
22074 if (IsSigned) {
22075 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22076 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22077 } else {
22078 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22079 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22080 }
22081
22082 const fltSemantics &Sem = SrcVT.getFltSemantics();
22083 APFloat MinFloat(Sem);
22084 APFloat MaxFloat(Sem);
22085
22086 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22087 MinInt, IsSigned, APFloat::rmTowardZero);
22088 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22089 MaxInt, IsSigned, APFloat::rmTowardZero);
22090 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22091 && !(MaxStatus & APFloat::opStatus::opInexact);
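// Clamping in the FP domain before converting is only safe if both bounds are
// exactly representable in SrcVT; otherwise the rounded bounds could let a value
// slip past the saturation limit, so fall through to the compare+select path.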
22092
22093 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22094 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22095
22096 // If the integer bounds are exactly representable as floats, emit a
22097 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22098 if (AreExactFloatBounds) {
22099 if (DstVT != TmpVT) {
22100 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22101 SDValue MinClamped = DAG.getNode(
22102 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22103 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22104 SDValue BothClamped = DAG.getNode(
22105 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22106 // Convert clamped value to integer.
22107 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22108
22109 // NaN will become INDVAL, with the top bit set and the rest zero.
22110 // Truncation will discard the top bit, resulting in zero.
22111 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22112 }
22113
22114 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22115 SDValue MinClamped = DAG.getNode(
22116 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22117 // Clamp by MaxFloat from above. NaN cannot occur.
22118 SDValue BothClamped = DAG.getNode(
22119 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22120 // Convert clamped value to integer.
22121 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22122
22123 if (!IsSigned) {
22124 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22125 // which is zero.
22126 return FpToInt;
22127 }
22128
22129 // Otherwise, select zero if Src is NaN.
22130 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22131 return DAG.getSelectCC(
22132 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22133 }
22134
22135 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22136 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22137
22138 // Result of direct conversion, which may be selected away.
22139 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22140
22141 if (DstVT != TmpVT) {
22142 // NaN will become INDVAL, with the top bit set and the rest zero.
22143 // Truncation will discard the top bit, resulting in zero.
22144 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22145 }
22146
22147 SDValue Select = FpToInt;
22148 // For signed conversions where we saturate to the same size as the
22149 // result type of the fptoi instructions, INDVAL coincides with integer
22150 // minimum, so we don't need to explicitly check it.
22151 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22152 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22153 // MinInt if Src is NaN.
22154 Select = DAG.getSelectCC(
22155 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22156 }
22157
22158 // If Src OGT MaxFloat, select MaxInt.
22159 Select = DAG.getSelectCC(
22160 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22161
22162 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22163 // is already zero. The promoted case was already handled above.
22164 if (!IsSigned || DstVT != TmpVT) {
22165 return Select;
22166 }
22167
22168 // Otherwise, select 0 if Src is NaN.
22169 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22170 return DAG.getSelectCC(
22171 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22172}
22173
22174SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22175 bool IsStrict = Op->isStrictFPOpcode();
22176
22177 SDLoc DL(Op);
22178 MVT VT = Op.getSimpleValueType();
22179 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22180 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22181 MVT SVT = In.getSimpleValueType();
22182
22183 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22184 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22185 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22186 !Subtarget.getTargetTriple().isOSDarwin()))
22187 return SDValue();
22188
22189 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22190 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22191 return Op;
22192
22193 if (SVT == MVT::f16) {
22194 if (Subtarget.hasFP16())
22195 return Op;
22196
22197 if (VT != MVT::f32) {
22198 if (IsStrict)
22199 return DAG.getNode(
22200 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22201 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22202 {MVT::f32, MVT::Other}, {Chain, In})});
22203
22204 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22205 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22206 }
22207
22208 if (!Subtarget.hasF16C()) {
22209 if (!Subtarget.getTargetTriple().isOSDarwin())
22210 return SDValue();
22211
22212 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22213
22214 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22215 TargetLowering::CallLoweringInfo CLI(DAG);
22216 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22217
22218 In = DAG.getBitcast(MVT::i16, In);
22219 TargetLowering::ArgListTy Args;
22220 TargetLowering::ArgListEntry Entry(
22221 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22222 Entry.IsSExt = false;
22223 Entry.IsZExt = true;
22224 Args.push_back(Entry);
22225
22226 SDValue Callee = DAG.getExternalSymbol(
22227 getLibcallName(RTLIB::FPEXT_F16_F32),
22228 getPointerTy(DAG.getDataLayout()));
22229 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22230 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22231 std::move(Args));
22232
22233 SDValue Res;
22234 std::tie(Res,Chain) = LowerCallTo(CLI);
22235 if (IsStrict)
22236 Res = DAG.getMergeValues({Res, Chain}, DL);
22237
22238 return Res;
22239 }
22240
22241 In = DAG.getBitcast(MVT::i16, In);
22242 SDValue Res;
22243 if (IsStrict) {
22244 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22245 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22246 DAG.getVectorIdxConstant(0, DL));
22247 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22248 {Chain, In});
22249 Chain = Res.getValue(1);
22250 } else {
22251 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22252 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22253 DAG.getUNDEF(MVT::v4i32), In,
22254 DAG.getVectorIdxConstant(0, DL));
22255 In = DAG.getBitcast(MVT::v8i16, In);
22256 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22257 DAG.getTargetConstant(4, DL, MVT::i32));
22258 }
22259 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22260 DAG.getVectorIdxConstant(0, DL));
22261 if (IsStrict)
22262 return DAG.getMergeValues({Res, Chain}, DL);
22263 return Res;
22264 }
22265
22266 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22267 return Op;
22268
22269 if (SVT.getVectorElementType() == MVT::f16) {
22270 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22271 return Op;
22272 assert(Subtarget.hasF16C() && "Unexpected features!");
22273 if (SVT == MVT::v2f16)
22274 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22275 DAG.getUNDEF(MVT::v2f16));
22276 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22277 DAG.getUNDEF(MVT::v4f16));
22278 if (IsStrict)
22279 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22280 {Op->getOperand(0), Res});
22281 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22282 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22283 return Op;
22284 }
22285
22286 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22287
22288 SDValue Res =
22289 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22290 if (IsStrict)
22291 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22292 {Op->getOperand(0), Res});
22293 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22294}
22295
22296SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22297 bool IsStrict = Op->isStrictFPOpcode();
22298
22299 SDLoc DL(Op);
22300 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22301 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22302 MVT VT = Op.getSimpleValueType();
22303 MVT SVT = In.getSimpleValueType();
22304
22305 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22306 return SDValue();
22307
22308 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22309 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22310 if (!Subtarget.getTargetTriple().isOSDarwin())
22311 return SDValue();
22312
22313 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22314 TargetLowering::CallLoweringInfo CLI(DAG);
22315 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22316
22317 TargetLowering::ArgListTy Args;
22318 TargetLowering::ArgListEntry Entry(
22319 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22320 Entry.IsSExt = false;
22321 Entry.IsZExt = true;
22322 Args.push_back(Entry);
22323
22324 SDValue Callee = DAG.getExternalSymbol(
22325 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22326 : RTLIB::FPROUND_F32_F16),
22327 getPointerTy(DAG.getDataLayout()));
22328 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22329 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22330 std::move(Args));
22331
22332 SDValue Res;
22333 std::tie(Res, Chain) = LowerCallTo(CLI);
22334
22335 Res = DAG.getBitcast(MVT::f16, Res);
22336
22337 if (IsStrict)
22338 Res = DAG.getMergeValues({Res, Chain}, DL);
22339
22340 return Res;
22341 }
22342
22343 if (VT.getScalarType() == MVT::bf16) {
22344 if (SVT.getScalarType() == MVT::f32 &&
22345 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22346 Subtarget.hasAVXNECONVERT()))
22347 return Op;
22348 return SDValue();
22349 }
22350
22351 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22352 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22353 return SDValue();
22354
22355 if (VT.isVector())
22356 return Op;
22357
22358 SDValue Res;
22359 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22360 MVT::i32);
22361 if (IsStrict) {
22362 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22363 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22364 DAG.getVectorIdxConstant(0, DL));
22365 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22366 {Chain, Res, Rnd});
22367 Chain = Res.getValue(1);
22368 } else {
22369 // FIXME: Should we use zeros for upper elements for non-strict?
22370 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22371 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22372 }
22373
22374 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22375 DAG.getVectorIdxConstant(0, DL));
22376 Res = DAG.getBitcast(MVT::f16, Res);
22377
22378 if (IsStrict)
22379 return DAG.getMergeValues({Res, Chain}, DL);
22380
22381 return Res;
22382 }
22383
22384 return Op;
22385}
22386
22387static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22388 bool IsStrict = Op->isStrictFPOpcode();
22389 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22390 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22391 "Unexpected VT!");
22392
22393 SDLoc dl(Op);
22394 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22395 DAG.getConstant(0, dl, MVT::v8i16), Src,
22396 DAG.getVectorIdxConstant(0, dl));
22397
22398 SDValue Chain;
22399 if (IsStrict) {
22400 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22401 {Op.getOperand(0), Res});
22402 Chain = Res.getValue(1);
22403 } else {
22404 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22405 }
22406
22407 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22408 DAG.getVectorIdxConstant(0, dl));
22409
22410 if (IsStrict)
22411 return DAG.getMergeValues({Res, Chain}, dl);
22412
22413 return Res;
22414}
22415
22416static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22417 bool IsStrict = Op->isStrictFPOpcode();
22418 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22419 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22420 "Unexpected VT!");
22421
22422 SDLoc dl(Op);
22423 SDValue Res, Chain;
22424 if (IsStrict) {
22425 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22426 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22427 DAG.getVectorIdxConstant(0, dl));
22428 Res = DAG.getNode(
22429 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22430 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22431 Chain = Res.getValue(1);
22432 } else {
22433 // FIXME: Should we use zeros for upper elements for non-strict?
22434 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22435 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22436 DAG.getTargetConstant(4, dl, MVT::i32));
22437 }
22438
22439 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22440 DAG.getVectorIdxConstant(0, dl));
22441
22442 if (IsStrict)
22443 return DAG.getMergeValues({Res, Chain}, dl);
22444
22445 return Res;
22446}
22447
22448SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22449 SelectionDAG &DAG) const {
22450 SDLoc DL(Op);
22451
22452 MVT SVT = Op.getOperand(0).getSimpleValueType();
22453 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22454 Subtarget.hasAVXNECONVERT())) {
22455 SDValue Res;
22456 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22457 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22458 Res = DAG.getBitcast(MVT::v8i16, Res);
22459 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22460 DAG.getVectorIdxConstant(0, DL));
22461 }
22462
22463 MakeLibCallOptions CallOptions;
22464 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22465 SDValue Res =
22466 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22467 return DAG.getBitcast(MVT::i16, Res);
22468}
22469
22470/// Depending on uarch and/or optimizing for size, we might prefer to use a
22471/// vector operation in place of the typical scalar operation.
22472static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22473 SelectionDAG &DAG,
22474 const X86Subtarget &Subtarget) {
22475 // If both operands have other uses, this is probably not profitable.
22476 SDValue LHS = Op.getOperand(0);
22477 SDValue RHS = Op.getOperand(1);
22478 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22479 return Op;
22480
22481 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22482 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22483 if (IsFP && !Subtarget.hasSSE3())
22484 return Op;
22485 if (!IsFP && !Subtarget.hasSSSE3())
22486 return Op;
22487
22488 // Extract from a common vector.
22489 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22490 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22491 LHS.getOperand(0) != RHS.getOperand(0) ||
22492 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22493 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22494 !shouldUseHorizontalOp(true, DAG, Subtarget))
22495 return Op;
22496
22497 // Allow commuted 'hadd' ops.
22498 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22499 unsigned HOpcode;
22500 switch (Op.getOpcode()) {
22501 // clang-format off
22502 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22503 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22504 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22505 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22506 default:
22507 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22508 // clang-format on
22509 }
22510 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22511 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22512 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22513 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22514 std::swap(LExtIndex, RExtIndex);
22515
22516 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22517 return Op;
22518
22519 SDValue X = LHS.getOperand(0);
22520 EVT VecVT = X.getValueType();
22521 unsigned BitWidth = VecVT.getSizeInBits();
22522 unsigned NumLanes = BitWidth / 128;
22523 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22524 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22525 "Not expecting illegal vector widths here");
22526
22527 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22528 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22529 if (BitWidth == 256 || BitWidth == 512) {
22530 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22531 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22532 LExtIndex %= NumEltsPerLane;
22533 }
22534
22535 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22536 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22537 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22538 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22539 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22540 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22541 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22542}
22543
22544/// Depending on uarch and/or optimizing for size, we might prefer to use a
22545/// vector operation in place of the typical scalar operation.
22546SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22547 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22548 "Only expecting float/double");
22549 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22550}
22551
22552/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22553/// This mode isn't supported in hardware on X86. But as long as we aren't
22554/// compiling with trapping math, we can emulate this with
22555/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
22556static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22557 SDValue N0 = Op.getOperand(0);
22558 SDLoc dl(Op);
22559 MVT VT = Op.getSimpleValueType();
22560
22561 // N0 += copysign(nextafter(0.5, 0.0), N0)
22562 const fltSemantics &Sem = VT.getFltSemantics();
22563 bool Ignored;
22564 APFloat Point5Pred = APFloat(0.5f);
22565 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22566 Point5Pred.next(/*nextDown*/true);
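// Adding exactly 0.5 would be wrong for inputs just below 0.5: e.g. for f32,
// 0.49999997f + 0.5f rounds up to 1.0f, so trunc would return 1. Using the value
// immediately below 0.5 keeps such inputs below 1.0 while 0.5 itself still
// rounds away from zero.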
22567
22568 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22569 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22570 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22571
22572 // Truncate the result to remove fraction.
22573 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22574}
22575
22576/// The only differences between FABS and FNEG are the mask and the logic op.
22577/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22578static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22579 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22580 "Wrong opcode for lowering FABS or FNEG.");
22581
22582 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22583
22584 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22585 // into an FNABS. We'll lower the FABS after that if it is still in use.
22586 if (IsFABS)
22587 for (SDNode *User : Op->users())
22588 if (User->getOpcode() == ISD::FNEG)
22589 return Op;
22590
22591 SDLoc dl(Op);
22592 MVT VT = Op.getSimpleValueType();
22593
22594 bool IsF128 = (VT == MVT::f128);
22595 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22596 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22597 "Unexpected type in LowerFABSorFNEG");
22598
22599 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22600 // decide if we should generate a 16-byte constant mask when we only need 4 or
22601 // 8 bytes for the scalar case.
22602
22603 // There are no scalar bitwise logical SSE/AVX instructions, so we
22604 // generate a 16-byte vector constant and logic op even for the scalar case.
22605 // Using a 16-byte mask allows folding the load of the mask with
22606 // the logic op, so it can save (~4 bytes) on code size.
22607 bool IsFakeVector = !VT.isVector() && !IsF128;
22608 MVT LogicVT = VT;
22609 if (IsFakeVector)
22610 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22611 : (VT == MVT::f32) ? MVT::v4f32
22612 : MVT::v8f16;
22613
22614 unsigned EltBits = VT.getScalarSizeInBits();
22615 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22616 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22617 APInt::getSignMask(EltBits);
22618 const fltSemantics &Sem = VT.getFltSemantics();
22619 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22620
22621 SDValue Op0 = Op.getOperand(0);
22622 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22623 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22624 IsFNABS ? X86ISD::FOR :
22625 X86ISD::FXOR;
22626 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22627
22628 if (VT.isVector() || IsF128)
22629 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22630
22631 // For the scalar case extend to a 128-bit vector, perform the logic op,
22632 // and extract the scalar result back out.
22633 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22634 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22635 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22636 DAG.getVectorIdxConstant(0, dl));
22637}
22638
22639static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22640 SDValue Mag = Op.getOperand(0);
22641 SDValue Sign = Op.getOperand(1);
22642 SDLoc dl(Op);
22643
22644 // If the sign operand is smaller, extend it first.
22645 MVT VT = Op.getSimpleValueType();
22646 if (Sign.getSimpleValueType().bitsLT(VT))
22647 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22648
22649 // And if it is bigger, shrink it first.
22650 if (Sign.getSimpleValueType().bitsGT(VT))
22651 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22652 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22653
22654 // At this point the operands and the result should have the same
22655 // type, and that won't be f80 since that is not custom lowered.
22656 bool IsF128 = (VT == MVT::f128);
22657 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22658 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22659 "Unexpected type in LowerFCOPYSIGN");
22660
22661 const fltSemantics &Sem = VT.getFltSemantics();
22662
22663 // Perform all scalar logic operations as 16-byte vectors because there are no
22664 // scalar FP logic instructions in SSE.
22665 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22666 // unnecessary splats, but we might miss load folding opportunities. Should
22667 // this decision be based on OptimizeForSize?
22668 bool IsFakeVector = !VT.isVector() && !IsF128;
22669 MVT LogicVT = VT;
22670 if (IsFakeVector)
22671 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22672 : (VT == MVT::f32) ? MVT::v4f32
22673 : MVT::v8f16;
22674
22675 // The mask constants are automatically splatted for vector types.
22676 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22677 SDValue SignMask = DAG.getConstantFP(
22678 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22679 SDValue MagMask = DAG.getConstantFP(
22680 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
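// SignMask (0x80...0) keeps only the sign bit; MagMask (0x7f...f) keeps
// everything except the sign bit.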
22681
22682 // First, clear all bits but the sign bit from the second operand (sign).
22683 if (IsFakeVector)
22684 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22685 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22686
22687 // Next, clear the sign bit from the first operand (magnitude).
22688 // TODO: If we had general constant folding for FP logic ops, this check
22689 // wouldn't be necessary.
22690 SDValue MagBits;
22691 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22692 APFloat APF = Op0CN->getValueAPF();
22693 APF.clearSign();
22694 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22695 } else {
22696 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22697 if (IsFakeVector)
22698 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22699 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22700 }
22701
22702 // OR the magnitude value with the sign bit.
22703 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22704 return !IsFakeVector ? Or
22705 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22706 DAG.getVectorIdxConstant(0, dl));
22707}
22708
22709static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22710 SDValue N0 = Op.getOperand(0);
22711 SDLoc dl(Op);
22712 MVT VT = Op.getSimpleValueType();
22713
22714 MVT OpVT = N0.getSimpleValueType();
22715 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22716 "Unexpected type for FGETSIGN");
22717
22718 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22719 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22720 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22721 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22722 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22723 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22724 return Res;
22725}
22726
22727/// Helper for attempting to create a X86ISD::BT node.
22728static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22729 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22730 // instruction. Since the shift amount is in-range-or-undefined, we know
22731 // that doing a bittest on the i32 value is ok. We extend to i32 because
22732 // the encoding for the i16 version is larger than the i32 version.
22733 // Also promote i16 to i32 for performance / code size reasons.
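 // For example, testing bit n of an i8 value x emits
 // (X86ISD::BT (i32 any_extend x), n).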
22734 if (Src.getValueType().getScalarSizeInBits() < 32)
22735 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22736
22737 // No legal type found, give up.
22738 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22739 return SDValue();
22740
22741 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22742 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22743 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22744 // known to be zero.
22745 if (Src.getValueType() == MVT::i64 &&
22746 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22747 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22748
22749 // If the operand types disagree, extend the shift amount to match. Since
22750 // BT ignores high bits (like shifts) we can use anyextend.
22751 if (Src.getValueType() != BitNo.getValueType()) {
22752 // Peek through a mask/modulo operation.
22753 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22754 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22755 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22756 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22757 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22758 BitNo.getOperand(0)),
22759 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22760 BitNo.getOperand(1)));
22761 else
22762 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22763 }
22764
22765 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22766}
22767
22768/// Helper for creating a X86ISD::SETCC node.
22769static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22770 SelectionDAG &DAG) {
22771 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22772 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22773}
22774
22775/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22776/// recognizable memcmp expansion.
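/// For example, (or (xor A, B), (or (xor C, D), (xor E, F))) is recognized,
/// while a bare (xor A, B) at the root is not.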
22777static bool isOrXorXorTree(SDValue X, bool Root = true) {
22778 if (X.getOpcode() == ISD::OR)
22779 return isOrXorXorTree(X.getOperand(0), false) &&
22780 isOrXorXorTree(X.getOperand(1), false);
22781 if (Root)
22782 return false;
22783 return X.getOpcode() == ISD::XOR;
22784}
22785
22786/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22787/// expansion.
22788template <typename F>
22789static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22790 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22791 SDValue Op0 = X.getOperand(0);
22792 SDValue Op1 = X.getOperand(1);
22793 if (X.getOpcode() == ISD::OR) {
22794 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22795 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22796 if (VecVT != CmpVT)
22797 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22798 if (HasPT)
22799 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22800 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22801 }
22802 if (X.getOpcode() == ISD::XOR) {
22803 SDValue A = SToV(Op0);
22804 SDValue B = SToV(Op1);
22805 if (VecVT != CmpVT)
22806 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22807 if (HasPT)
22808 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22809 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22810 }
22811 llvm_unreachable("Impossible");
22812}
22813
22814/// Try to map a 128-bit or larger integer comparison to vector instructions
22815/// before type legalization splits it up into chunks.
22816static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22817 ISD::CondCode CC,
22818 const SDLoc &DL,
22819 SelectionDAG &DAG,
22820 const X86Subtarget &Subtarget) {
22821 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22822
22823 // We're looking for an oversized integer equality comparison.
22824 EVT OpVT = X.getValueType();
22825 unsigned OpSize = OpVT.getSizeInBits();
22826 if (!OpVT.isScalarInteger() || OpSize < 128)
22827 return SDValue();
22828
22829 // Ignore a comparison with zero because that gets special treatment in
22830 // EmitTest(). But make an exception for the special case of a pair of
22831 // logically-combined vector-sized operands compared to zero. This pattern may
22832 // be generated by the memcmp expansion pass with oversized integer compares
22833 // (see PR33325).
22834 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22835 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22836 return SDValue();
22837
22838 // Don't perform this combine if constructing the vector will be expensive.
22839 auto IsVectorBitCastCheap = [](SDValue X) {
22840 X = peekThroughBitcasts(X);
22841 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22842 X.getOpcode() == ISD::LOAD;
22843 };
22844 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22845 !IsOrXorXorTreeCCZero)
22846 return SDValue();
22847
22848 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22849 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22850 // Otherwise use PCMPEQ (plus AND) and mask testing.
22851 bool NoImplicitFloatOps =
22852 DAG.getMachineFunction().getFunction().hasFnAttribute(
22853 Attribute::NoImplicitFloat);
22854 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22855 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22856 (OpSize == 256 && Subtarget.hasAVX()) ||
22857 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22858 bool HasPT = Subtarget.hasSSE41();
22859
22860 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22861 // vector registers are essentially free. (Technically, widening registers
22862 // prevents load folding, but the tradeoff is worth it.)
22863 bool PreferKOT = Subtarget.preferMaskRegisters();
22864 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22865
22866 EVT VecVT = MVT::v16i8;
22867 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22868 if (OpSize == 256) {
22869 VecVT = MVT::v32i8;
22870 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22871 }
22872 EVT CastVT = VecVT;
22873 bool NeedsAVX512FCast = false;
22874 if (OpSize == 512 || NeedZExt) {
22875 if (Subtarget.hasBWI()) {
22876 VecVT = MVT::v64i8;
22877 CmpVT = MVT::v64i1;
22878 if (OpSize == 512)
22879 CastVT = VecVT;
22880 } else {
22881 VecVT = MVT::v16i32;
22882 CmpVT = MVT::v16i1;
22883 CastVT = OpSize == 512 ? VecVT
22884 : OpSize == 256 ? MVT::v8i32
22885 : MVT::v4i32;
22886 NeedsAVX512FCast = true;
22887 }
22888 }
22889
22890 auto ScalarToVector = [&](SDValue X) -> SDValue {
22891 bool TmpZext = false;
22892 EVT TmpCastVT = CastVT;
22893 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22894 SDValue OrigX = X.getOperand(0);
22895 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22896 if (OrigSize < OpSize) {
22897 if (OrigSize == 128) {
22898 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22899 X = OrigX;
22900 TmpZext = true;
22901 } else if (OrigSize == 256) {
22902 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22903 X = OrigX;
22904 TmpZext = true;
22905 }
22906 }
22907 }
22908 X = DAG.getBitcast(TmpCastVT, X);
22909 if (!NeedZExt && !TmpZext)
22910 return X;
22911 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22912 DAG.getConstant(0, DL, VecVT), X,
22913 DAG.getVectorIdxConstant(0, DL));
22914 };
22915
22916 SDValue Cmp;
22917 if (IsOrXorXorTreeCCZero) {
22918 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22919 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22920 // Use 2 vector equality compares and 'and' the results before doing a
22921 // MOVMSK.
22922 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22923 } else {
22924 SDValue VecX = ScalarToVector(X);
22925 SDValue VecY = ScalarToVector(Y);
22926 if (VecVT != CmpVT) {
22927 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22928 } else if (HasPT) {
22929 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22930 } else {
22931 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22932 }
22933 }
22934 // AVX512 should emit a setcc that will lower to kortest.
22935 if (VecVT != CmpVT) {
22936 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22937 : CmpVT == MVT::v32i1 ? MVT::i32
22938 : MVT::i16;
22939 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22940 DAG.getConstant(0, DL, KRegVT), CC);
22941 }
22942 if (HasPT) {
22943 SDValue BCCmp =
22944 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22945 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22946 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22947 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22948 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22949 }
22950 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22951 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22952 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22953 assert(Cmp.getValueType() == MVT::v16i8 &&
22954 "Non 128-bit vector on pre-SSE41 target");
22955 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22956 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22957 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22958 }
22959
22960 return SDValue();
22961}
22962
22963/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22964/// style scalarized (associative) reduction patterns. Partial reductions
22965/// are supported when the pointer SrcMask is non-null.
22966/// TODO - move this to SelectionDAG?
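/// For example, with BinOp == ISD::OR this matches
/// (or (extractelt v4i32 X, 0),
///     (or (extractelt X, 1), (or (extractelt X, 2), (extractelt X, 3))))
/// and records that all four elements of X were used.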
22967static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22968 SmallVectorImpl<SDValue> &SrcOps,
22969 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22970 SmallVector<SDValue, 8> Opnds;
22971 DenseMap<SDValue, APInt> SrcOpMap;
22972 EVT VT = MVT::Other;
22973
22974 // Recognize a special case where a vector is casted into wide integer to
22975 // test all 0s.
22976 assert(Op.getOpcode() == unsigned(BinOp) &&
22977 "Unexpected bit reduction opcode");
22978 Opnds.push_back(Op.getOperand(0));
22979 Opnds.push_back(Op.getOperand(1));
22980
22981 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22982 SDValue I = Opnds[Slot];
22983 // BFS traverse all BinOp operands.
22984 if (I->getOpcode() == unsigned(BinOp)) {
22985 Opnds.push_back(I->getOperand(0));
22986 Opnds.push_back(I->getOperand(1));
22987 // Re-evaluate the number of nodes to be traversed.
22988 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22989 continue;
22990 }
22991
22992 // Quit if this is not an EXTRACT_VECTOR_ELT.
22993 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22994 return false;
22995
22996 // Quit if the index is not a constant.
22997 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22998 if (!Idx)
22999 return false;
23000
23001 SDValue Src = I->getOperand(0);
23002 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23003 if (M == SrcOpMap.end()) {
23004 VT = Src.getValueType();
23005 // Quit if not the same type.
23006 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23007 return false;
23008 unsigned NumElts = VT.getVectorNumElements();
23009 APInt EltCount = APInt::getZero(NumElts);
23010 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23011 SrcOps.push_back(Src);
23012 }
23013
23014 // Quit if element already used.
23015 unsigned CIdx = Idx->getZExtValue();
23016 if (M->second[CIdx])
23017 return false;
23018 M->second.setBit(CIdx);
23019 }
23020
23021 if (SrcMask) {
23022 // Collect the source partial masks.
23023 for (SDValue &SrcOp : SrcOps)
23024 SrcMask->push_back(SrcOpMap[SrcOp]);
23025 } else {
23026 // Quit if not all elements are used.
23027 for (const auto &I : SrcOpMap)
23028 if (!I.second.isAllOnes())
23029 return false;
23030 }
23031
23032 return true;
23033}
23034
23035// Helper function for comparing all bits of two vectors.
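// Depending on the subtarget this becomes CMP(MOVMSK(NOT(PCMPEQ(LHS,RHS))),0),
// PTEST(XOR(LHS,RHS)) with SSE4.1, or KORTEST of a mask compare with AVX512.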
23036static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23037 ISD::CondCode CC, const APInt &OriginalMask,
23038 const X86Subtarget &Subtarget,
23039 SelectionDAG &DAG, X86::CondCode &X86CC) {
23040 EVT VT = LHS.getValueType();
23041 unsigned ScalarSize = VT.getScalarSizeInBits();
23042 if (OriginalMask.getBitWidth() != ScalarSize) {
23043 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23044 return SDValue();
23045 }
23046
23047 // Quit if not convertible to legal scalar or 128/256-bit vector.
23048 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23049 return SDValue();
23050
23051 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23052 if (VT.isFloatingPoint())
23053 return SDValue();
23054
23055 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23056 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23057
23058 APInt Mask = OriginalMask;
23059
23060 auto MaskBits = [&](SDValue Src) {
23061 if (Mask.isAllOnes())
23062 return Src;
23063 EVT SrcVT = Src.getValueType();
23064 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23065 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23066 };
23067
23068 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23069 if (VT.getSizeInBits() < 128) {
23070 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23071 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23072 if (IntVT != MVT::i64)
23073 return SDValue();
23074 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23075 MVT::i32, MVT::i32);
23076 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23077 MVT::i32, MVT::i32);
23078 SDValue Lo =
23079 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23080 SDValue Hi =
23081 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23082 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23083 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23084 DAG.getConstant(0, DL, MVT::i32));
23085 }
23086 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23087 DAG.getBitcast(IntVT, MaskBits(LHS)),
23088 DAG.getBitcast(IntVT, MaskBits(RHS)));
23089 }
23090
23091 // Without PTEST, a masked v2i64 or-reduction is not faster than
23092 // scalarization.
23093 bool UseKORTEST = Subtarget.useAVX512Regs();
23094 bool UsePTEST = Subtarget.hasSSE41();
23095 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23096 return SDValue();
23097
23098 // Split down to 128/256/512-bit vector.
23099 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23100
23101 // If the input vector has vector elements wider than the target test size,
23102 // then cast to <X x i64> so it will safely split.
23103 if (ScalarSize > TestSize) {
23104 if (!Mask.isAllOnes())
23105 return SDValue();
23106 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23107 LHS = DAG.getBitcast(VT, LHS);
23108 RHS = DAG.getBitcast(VT, RHS);
23109 Mask = APInt::getAllOnes(64);
23110 }
23111
23112 if (VT.getSizeInBits() > TestSize) {
23113 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23114 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23115 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23116 while (VT.getSizeInBits() > TestSize) {
23117 auto Split = DAG.SplitVector(LHS, DL);
23118 VT = Split.first.getValueType();
23119 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23120 }
23121 RHS = DAG.getAllOnesConstant(DL, VT);
23122 } else if (!UsePTEST && !KnownRHS.isZero()) {
23123 // MOVMSK Special Case:
23124 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23125 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23126 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23127 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23128 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23129 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23130 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23131 V = DAG.getSExtOrTrunc(V, DL, VT);
23132 while (VT.getSizeInBits() > TestSize) {
23133 auto Split = DAG.SplitVector(V, DL);
23134 VT = Split.first.getValueType();
23135 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23136 }
23137 V = DAG.getNOT(DL, V, VT);
23138 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23139 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23140 DAG.getConstant(0, DL, MVT::i32));
23141 } else {
23142 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23143 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23144 while (VT.getSizeInBits() > TestSize) {
23145 auto Split = DAG.SplitVector(V, DL);
23146 VT = Split.first.getValueType();
23147 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23148 }
23149 LHS = V;
23150 RHS = DAG.getConstant(0, DL, VT);
23151 }
23152 }
23153
23154 if (UseKORTEST && VT.is512BitVector()) {
23155 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23156 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23157 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23158 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23159 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23160 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23161 }
23162
23163 if (UsePTEST) {
23164 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23165 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23166 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23167 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23168 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23169 }
23170
23171 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23172 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23173 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23174 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23175 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23176 V = DAG.getNOT(DL, V, MaskVT);
23177 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23178 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23179 DAG.getConstant(0, DL, MVT::i32));
23180}
23181
23182// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
23183// to CMP(MOVMSK(PCMPEQB(X,Y))).
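// For example, icmp eq (or (extractelt X, 0), (extractelt X, 1)), 0 becomes a
// single vector is-all-zero test of X instead of a scalar OR + CMP.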
23184static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23185 ISD::CondCode CC, const SDLoc &DL,
23186 const X86Subtarget &Subtarget,
23187 SelectionDAG &DAG,
23188 X86::CondCode &X86CC) {
23189 SDValue Op = OrigLHS;
23190
23191 bool CmpNull;
23192 APInt Mask;
23193 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23194 CmpNull = isNullConstant(OrigRHS);
23195 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23196 return SDValue();
23197
23198 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23199 return SDValue();
23200
23201 // Check whether we're masking/truncating an OR-reduction result, in which
23202 // case track the masked bits.
23203 // TODO: Add CmpAllOnes support.
23204 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23205 if (CmpNull) {
23206 switch (Op.getOpcode()) {
23207 case ISD::TRUNCATE: {
23208 SDValue Src = Op.getOperand(0);
23209 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23210 Op.getScalarValueSizeInBits());
23211 Op = Src;
23212 break;
23213 }
23214 case ISD::AND: {
23215 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23216 Mask = Cst->getAPIntValue();
23217 Op = Op.getOperand(0);
23218 }
23219 break;
23220 }
23221 }
23222 }
23223 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23224 CC = ISD::SETEQ;
23225 CmpNull = true;
23226 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23227 } else {
23228 return SDValue();
23229 }
23230
23231 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23232
23233 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23234 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23235 SmallVector<SDValue, 8> VecIns;
23236 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23237 EVT VT = VecIns[0].getValueType();
23238 assert(llvm::all_of(VecIns,
23239 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23240 "Reduction source vector mismatch");
23241
23242 // Quit if not splittable to scalar/128/256/512-bit vector.
23243 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23244 return SDValue();
23245
23246 // If more than one full vector is evaluated, AND/OR them first before
23247 // PTEST.
23248 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23249 Slot += 2, e += 1) {
23250 // Each iteration will AND/OR 2 nodes and append the result until there is
23251 // only 1 node left, i.e. the final value of all vectors.
23252 SDValue LHS = VecIns[Slot];
23253 SDValue RHS = VecIns[Slot + 1];
23254 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23255 }
23256
23257 return LowerVectorAllEqual(DL, VecIns.back(),
23258 CmpNull ? DAG.getConstant(0, DL, VT)
23259 : DAG.getAllOnesConstant(DL, VT),
23260 CC, Mask, Subtarget, DAG, X86CC);
23261 }
23262
23263 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23264 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23265 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23266 ISD::NodeType BinOp;
23267 if (SDValue Match =
23268 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23269 EVT MatchVT = Match.getValueType();
23270 return LowerVectorAllEqual(DL, Match,
23271 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23272 : DAG.getAllOnesConstant(DL, MatchVT),
23273 CC, Mask, Subtarget, DAG, X86CC);
23274 }
23275 }
23276
23277 if (Mask.isAllOnes()) {
23278 assert(!Op.getValueType().isVector() &&
23279 "Illegal vector type for reduction pattern");
23280 SDValue Src = peekThroughBitcasts(Op);
23281 if (Src.getValueType().isFixedLengthVector() &&
23282 Src.getValueType().getScalarType() == MVT::i1) {
23283 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23284 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23285 if (Src.getOpcode() == ISD::SETCC) {
23286 SDValue LHS = Src.getOperand(0);
23287 SDValue RHS = Src.getOperand(1);
23288 EVT LHSVT = LHS.getValueType();
23289 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23290 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23291 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23292 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23293 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23294 X86CC);
23295 }
23296 }
23297 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23298 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23299 // Peek through truncation, mask the LSB and compare against zero/LSB.
23300 if (Src.getOpcode() == ISD::TRUNCATE) {
23301 SDValue Inner = Src.getOperand(0);
23302 EVT InnerVT = Inner.getValueType();
23303 if (DAG.getTargetLoweringInfo().isTypeLegal(InnerVT)) {
23304 unsigned BW = InnerVT.getScalarSizeInBits();
23305 APInt SrcMask = APInt(BW, 1);
23306 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23307 return LowerVectorAllEqual(DL, Inner,
23308 DAG.getConstant(Cmp, DL, InnerVT), CC,
23309 SrcMask, Subtarget, DAG, X86CC);
23310 }
23311 }
23312 }
23313 }
23314
23315 return SDValue();
23316}
23317
23318/// return true if \c Op has a use that doesn't just read flags.
23319static bool hasNonFlagsUse(SDValue Op) {
23320 for (SDUse &Use : Op->uses()) {
23321 SDNode *User = Use.getUser();
23322 unsigned UOpNo = Use.getOperandNo();
23323 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23324 // Look past truncate.
23325 UOpNo = User->use_begin()->getOperandNo();
23326 User = User->use_begin()->getUser();
23327 }
23328
23329 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23330 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23331 return true;
23332 }
23333 return false;
23334}
23335
23336// Transform to an x86-specific ALU node with flags if there is a chance of
23337// using an RMW op or only the flags are used. Otherwise, leave
23338// the node alone and emit a 'cmp' or 'test' instruction.
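// For example, an (add (load p), x) whose only uses are a store back to p and
// an EFLAGS read can be selected as a single RMW 'add [p], x'.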
23339static bool isProfitableToUseFlagOp(SDValue Op) {
23340 for (SDNode *U : Op->users())
23341 if (U->getOpcode() != ISD::CopyToReg &&
23342 U->getOpcode() != ISD::SETCC &&
23343 U->getOpcode() != ISD::STORE)
23344 return false;
23345
23346 return true;
23347}
23348
23349/// Emit nodes that will be selected as "test Op0,Op0", or something
23350/// equivalent.
23351static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23352 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23353 // CF and OF aren't always set the way we want. Determine which
23354 // of these we need.
23355 bool NeedCF = false;
23356 bool NeedOF = false;
23357 switch (X86CC) {
23358 default: break;
23359 case X86::COND_A: case X86::COND_AE:
23360 case X86::COND_B: case X86::COND_BE:
23361 NeedCF = true;
23362 break;
23363 case X86::COND_G: case X86::COND_GE:
23364 case X86::COND_L: case X86::COND_LE:
23365 case X86::COND_O: case X86::COND_NO: {
23366 // Check if we really need to set the overflow flag.
23367 // If the operation has the NoSignedWrap flag, the
23368 // overflow flag is not actually needed.
23369 switch (Op->getOpcode()) {
23370 case ISD::ADD:
23371 case ISD::SUB:
23372 case ISD::MUL:
23373 case ISD::SHL:
23374 if (Op.getNode()->getFlags().hasNoSignedWrap())
23375 break;
23376 [[fallthrough]];
23377 default:
23378 NeedOF = true;
23379 break;
23380 }
23381 break;
23382 }
23383 }
23384 // See if we can use the EFLAGS value from the operand instead of
23385 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23386 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23387 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23388 // Emit a CMP with 0, which is the TEST pattern.
23389 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23390 DAG.getConstant(0, dl, Op.getValueType()));
23391 }
23392 unsigned Opcode = 0;
23393 unsigned NumOperands = 0;
23394
23395 SDValue ArithOp = Op;
23396
23397 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23398 // which may be the result of a CAST. We use the variable 'Op', which is the
23399 // non-casted variable when we check for possible users.
23400 switch (ArithOp.getOpcode()) {
23401 case ISD::AND:
23402 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23403 // because a TEST instruction will be better.
23404 if (!hasNonFlagsUse(Op))
23405 break;
23406
23407 [[fallthrough]];
23408 case ISD::ADD:
23409 case ISD::SUB:
23410 case ISD::OR:
23411 case ISD::XOR:
23412 if (!isProfitableToUseFlagOp(Op))
23413 break;
23414
23415 // Otherwise use a regular EFLAGS-setting instruction.
23416 switch (ArithOp.getOpcode()) {
23417 // clang-format off
23418 default: llvm_unreachable("unexpected operator!");
23419 case ISD::ADD: Opcode = X86ISD::ADD; break;
23420 case ISD::SUB: Opcode = X86ISD::SUB; break;
23421 case ISD::XOR: Opcode = X86ISD::XOR; break;
23422 case ISD::AND: Opcode = X86ISD::AND; break;
23423 case ISD::OR: Opcode = X86ISD::OR; break;
23424 // clang-format on
23425 }
23426
23427 NumOperands = 2;
23428 break;
23429 case X86ISD::ADD:
23430 case X86ISD::SUB:
23431 case X86ISD::OR:
23432 case X86ISD::XOR:
23433 case X86ISD::AND:
23434 return SDValue(Op.getNode(), 1);
23435 case ISD::SSUBO:
23436 case ISD::USUBO: {
23437 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23438 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23439 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23440 Op->getOperand(1)).getValue(1);
23441 }
23442 default:
23443 break;
23444 }
23445
23446 if (Opcode == 0) {
23447 // Emit a CMP with 0, which is the TEST pattern.
23448 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23449 DAG.getConstant(0, dl, Op.getValueType()));
23450 }
23451 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23452 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23453
23454 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23455 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23456 return SDValue(New.getNode(), 1);
23457}
23458
23459/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23460/// equivalent.
23461static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23462 const SDLoc &dl, SelectionDAG &DAG,
23463 const X86Subtarget &Subtarget) {
23464 if (isNullConstant(Op1))
23465 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23466
23467 EVT CmpVT = Op0.getValueType();
23468
23469 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23470 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23471
23472 // Only promote the compare up to i32 if it is a 16-bit operation
23473 // with an immediate. 16-bit immediates are to be avoided unless the target
23474 // isn't slowed down by length-changing prefixes, we're optimizing for
23475 // code size, or the comparison is with a folded load.
23476 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23477 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23478 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23479 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23480 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23481 // Don't do this if the immediate can fit in 8-bits.
23482 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23483 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23484 unsigned ExtendOp =
23485 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23486 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23487 // For equality comparisons try to use SIGN_EXTEND if the input was
23488 // truncate from something with enough sign bits.
23489 if (Op0.getOpcode() == ISD::TRUNCATE) {
23490 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23491 ExtendOp = ISD::SIGN_EXTEND;
23492 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23493 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23494 ExtendOp = ISD::SIGN_EXTEND;
23495 }
23496 }
23497
23498 CmpVT = MVT::i32;
23499 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23500 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23501 }
23502 }
23503
23504 // Try to shrink i64 compares if the input has enough zero bits.
23505 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23506 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23507 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23508 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23509 CmpVT = MVT::i32;
23510 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23511 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23512 }
23513
23514 // Try to shrink all i64 compares if the inputs are representable as signed
23515 // i32.
23516 if (CmpVT == MVT::i64 &&
23517 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23518 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23519 CmpVT = MVT::i32;
23520 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23521 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23522 }
23523
23524 // 0-x == y --> x+y == 0
23525 // 0-x != y --> x+y != 0
23526 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23527 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23528 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23529 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23530 return Add.getValue(1);
23531 }
23532
23533 // x == 0-y --> x+y == 0
23534 // x != 0-y --> x+y != 0
23535 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23536 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23537 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23538 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23539 return Add.getValue(1);
23540 }
23541
23542 // If we already have an XOR of the ops, use that to check for equality.
23543 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23544 unsigned X86Opc = X86ISD::SUB;
23545 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23546 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23547 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23548 X86Opc = X86ISD::XOR;
23549
23550 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23551 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23552 return CmpOp.getValue(1);
23553}
23554
23559
23560bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23561 SDNode *N, SDValue, SDValue IntPow2) const {
23562 if (N->getOpcode() == ISD::FDIV)
23563 return true;
23564
23565 EVT FPVT = N->getValueType(0);
23566 EVT IntVT = IntPow2.getValueType();
23567
23568 // This indicates a non-free bitcast.
23569 // TODO: This is probably overly conservative as we will need to scale the
23570 // integer vector anyways for the int->fp cast.
23571 if (FPVT.isVector() &&
23572 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23573 return false;
23574
23575 return true;
23576}
23577
23578/// Check if replacement of SQRT with RSQRT should be disabled.
23579bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23580 EVT VT = Op.getValueType();
23581
23582 // We don't need to replace SQRT with RSQRT for half type.
23583 if (VT.getScalarType() == MVT::f16)
23584 return true;
23585
23586 // We never want to use both SQRT and RSQRT instructions for the same input.
23587 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23588 return false;
23589
23590 if (VT.isVector())
23591 return Subtarget.hasFastVectorFSQRT();
23592 return Subtarget.hasFastScalarFSQRT();
23593}
23594
23595/// The minimum architected relative accuracy is 2^-12. We need one
23596/// Newton-Raphson step to have a good float result (24 bits of precision).
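/// Each step refines an estimate E of 1/sqrt(A) as E' = E * (1.5 - 0.5 * A * E * E),
/// roughly doubling the number of accurate bits.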
23597SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23598 SelectionDAG &DAG, int Enabled,
23599 int &RefinementSteps,
23600 bool &UseOneConstNR,
23601 bool Reciprocal) const {
23602 SDLoc DL(Op);
23603 EVT VT = Op.getValueType();
23604
23605 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23606 // It is likely not profitable to do this for f64 because a double-precision
23607 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23608 // instructions: convert to single, rsqrtss, convert back to double, refine
23609 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23610 // along with FMA, this could be a throughput win.
23611 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23612 // after legalize types.
23613 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23614 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23615 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23616 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23617 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23618 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23619 RefinementSteps = 1;
23620
23621 UseOneConstNR = false;
23622 // There is no FSQRT for 512-bits, but there is RSQRT14.
23623 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23624 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23625 if (RefinementSteps == 0 && !Reciprocal)
23626 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23627 return Estimate;
23628 }
23629
23630 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23631 Subtarget.hasFP16()) {
23632 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23633 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23634 RefinementSteps = 0;
23635
23636 if (VT == MVT::f16) {
23637 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23638 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23639 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23640 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23641 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23642 }
23643
23644 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23645 }
23646 return SDValue();
23647}
23648
23649/// The minimum architected relative accuracy is 2^-12. We need one
23650/// Newton-Raphson step to have a good float result (24 bits of precision).
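/// Each step refines an estimate E of 1/A as E' = E * (2.0 - A * E),
/// roughly doubling the number of accurate bits.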
23651SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23652 int Enabled,
23653 int &RefinementSteps) const {
23654 SDLoc DL(Op);
23655 EVT VT = Op.getValueType();
23656
23657 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23658 // It is likely not profitable to do this for f64 because a double-precision
23659 // reciprocal estimate with refinement on x86 prior to FMA requires
23660 // 15 instructions: convert to single, rcpss, convert back to double, refine
23661 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23662 // along with FMA, this could be a throughput win.
23663
23664 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23665 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23666 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23667 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23668 // Enable estimate codegen with 1 refinement step for vector division.
23669 // Scalar division estimates are disabled because they break too much
23670 // real-world code. These defaults are intended to match GCC behavior.
23671 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23672 return SDValue();
23673
23674 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23675 RefinementSteps = 1;
23676
23677 // There is no FSQRT for 512-bits, but there is RCP14.
23678 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23679 return DAG.getNode(Opcode, DL, VT, Op);
23680 }
23681
23682 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23683 Subtarget.hasFP16()) {
23684 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23685 RefinementSteps = 0;
23686
23687 if (VT == MVT::f16) {
23688 SDValue Zero = DAG.getIntPtrConstant(0, DL);
23689 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23690 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23691 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23692 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23693 }
23694
23695 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23696 }
23697 return SDValue();
23698}
23699
23700/// If we have at least two divisions that use the same divisor, convert to
23701/// multiplication by a reciprocal. This may need to be adjusted for a given
23702/// CPU if a division's cost is not at least twice the cost of a multiplication.
23703/// This is because we still need one division to calculate the reciprocal and
23704/// then we need two multiplies by that reciprocal as replacements for the
23705/// original divisions.
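/// For example, 'a/d' and 'b/d' are rewritten as 'r = 1.0/d; a*r; b*r', which
/// is why the threshold is two divisions per divisor.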
23706unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23707 return 2;
23708}
23709
23710SDValue
23711X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23712 SelectionDAG &DAG,
23713 SmallVectorImpl<SDNode *> &Created) const {
23714 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23715 if (isIntDivCheap(N->getValueType(0), Attr))
23716 return SDValue(N,0); // Lower SDIV as SDIV
23717
23718 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23719 "Unexpected divisor!");
23720
23721 // Only perform this transform if CMOV is supported otherwise the select
23722 // below will become a branch.
23723 if (!Subtarget.canUseCMOV())
23724 return SDValue();
23725
23726 // fold (sdiv X, pow2)
23727 EVT VT = N->getValueType(0);
23728 // FIXME: Support i8.
23729 if (VT != MVT::i16 && VT != MVT::i32 &&
23730 !(Subtarget.is64Bit() && VT == MVT::i64))
23731 return SDValue();
23732
23733 // If the divisor is 2 or -2, the default expansion is better.
23734 if (Divisor == 2 ||
23735 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23736 return SDValue();
23737
23738 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23739}
23740
23741/// Result of 'and' is compared against zero. Change to a BT node if possible.
23742/// Returns the BT node and the condition code needed to use it.
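/// For example, ((x >> n) & 1) == 0 becomes (X86ISD::BT x, n) read through
/// COND_AE (carry clear), and != 0 through COND_B (carry set).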
23743static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23744 SelectionDAG &DAG, X86::CondCode &X86CC) {
23745 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23746 SDValue Op0 = And.getOperand(0);
23747 SDValue Op1 = And.getOperand(1);
23748 if (Op0.getOpcode() == ISD::TRUNCATE)
23749 Op0 = Op0.getOperand(0);
23750 if (Op1.getOpcode() == ISD::TRUNCATE)
23751 Op1 = Op1.getOperand(0);
23752
23753 SDValue Src, BitNo;
23754 if (Op1.getOpcode() == ISD::SHL)
23755 std::swap(Op0, Op1);
23756 if (Op0.getOpcode() == ISD::SHL) {
23757 if (isOneConstant(Op0.getOperand(0))) {
23758 // If we looked past a truncate, check that it's only truncating away
23759 // known zeros.
23760 unsigned BitWidth = Op0.getValueSizeInBits();
23761 unsigned AndBitWidth = And.getValueSizeInBits();
23762 if (BitWidth > AndBitWidth) {
23763 KnownBits Known = DAG.computeKnownBits(Op0);
23764 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23765 return SDValue();
23766 }
23767 Src = Op1;
23768 BitNo = Op0.getOperand(1);
23769 }
23770 } else if (Op1.getOpcode() == ISD::Constant) {
23771 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23772 uint64_t AndRHSVal = AndRHS->getZExtValue();
23773 SDValue AndLHS = Op0;
23774
23775 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23776 Src = AndLHS.getOperand(0);
23777 BitNo = AndLHS.getOperand(1);
23778 } else {
23779 // Use BT if the immediate can't be encoded in a TEST instruction or we
23780 // are optimizing for size and the immediate won't fit in a byte.
23781 bool OptForSize = DAG.shouldOptForSize();
23782 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23783 isPowerOf2_64(AndRHSVal)) {
23784 Src = AndLHS;
23785 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23786 Src.getValueType());
23787 }
23788 }
23789 }
23790
23791 // No patterns found, give up.
23792 if (!Src.getNode())
23793 return SDValue();
23794
23795 // Remove any bit flip.
23796 if (isBitwiseNot(Src)) {
23797 Src = Src.getOperand(0);
23798 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23799 }
23800
23801 // Attempt to create the X86ISD::BT node.
23802 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23803 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23804 return BT;
23805 }
23806
23807 return SDValue();
23808}
23809
23810// Check if pre-AVX condcode can be performed by a single FCMP op.
23811static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23812 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23813}
23814
23815/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23816/// CMPs.
23817static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23818 SDValue &Op1, bool &IsAlwaysSignaling) {
23819 unsigned SSECC;
23820 bool Swap = false;
23821
23822 // SSE Condition code mapping:
23823 // 0 - EQ
23824 // 1 - LT
23825 // 2 - LE
23826 // 3 - UNORD
23827 // 4 - NEQ
23828 // 5 - NLT
23829 // 6 - NLE
23830 // 7 - ORD
23831 switch (SetCCOpcode) {
23832 // clang-format off
23833 default: llvm_unreachable("Unexpected SETCC condition");
23834 case ISD::SETOEQ:
23835 case ISD::SETEQ: SSECC = 0; break;
23836 case ISD::SETOGT:
23837 case ISD::SETGT: Swap = true; [[fallthrough]];
23838 case ISD::SETLT:
23839 case ISD::SETOLT: SSECC = 1; break;
23840 case ISD::SETOGE:
23841 case ISD::SETGE: Swap = true; [[fallthrough]];
23842 case ISD::SETLE:
23843 case ISD::SETOLE: SSECC = 2; break;
23844 case ISD::SETUO: SSECC = 3; break;
23845 case ISD::SETUNE:
23846 case ISD::SETNE: SSECC = 4; break;
23847 case ISD::SETULE: Swap = true; [[fallthrough]];
23848 case ISD::SETUGE: SSECC = 5; break;
23849 case ISD::SETULT: Swap = true; [[fallthrough]];
23850 case ISD::SETUGT: SSECC = 6; break;
23851 case ISD::SETO: SSECC = 7; break;
23852 case ISD::SETUEQ: SSECC = 8; break;
23853 case ISD::SETONE: SSECC = 12; break;
23854 // clang-format on
23855 }
23856 if (Swap)
23857 std::swap(Op0, Op1);
23858
23859 switch (SetCCOpcode) {
23860 default:
23861 IsAlwaysSignaling = true;
23862 break;
23863 case ISD::SETEQ:
23864 case ISD::SETOEQ:
23865 case ISD::SETUEQ:
23866 case ISD::SETNE:
23867 case ISD::SETONE:
23868 case ISD::SETUNE:
23869 case ISD::SETO:
23870 case ISD::SETUO:
23871 IsAlwaysSignaling = false;
23872 break;
23873 }
23874
23875 return SSECC;
23876}
23877
23878/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23879/// concatenate the result back.
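/// For example, a v8i32 integer compare on a target without AVX2 is performed
/// as two v4i32 compares whose results are concatenated back together.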
23880static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23881 SelectionDAG &DAG, const SDLoc &dl) {
23882 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23883 "Unsupported VTs!");
23884 SDValue CC = DAG.getCondCode(Cond);
23885
23886 // Extract the LHS Lo/Hi vectors
23887 SDValue LHS1, LHS2;
23888 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23889
23890 // Extract the RHS Lo/Hi vectors
23891 SDValue RHS1, RHS2;
23892 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23893
23894 // Issue the operation on the smaller types and concatenate the result back
23895 EVT LoVT, HiVT;
23896 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23897 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23898 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23899 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23900}
23901
23902static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23903 SelectionDAG &DAG) {
23904 SDValue Op0 = Op.getOperand(0);
23905 SDValue Op1 = Op.getOperand(1);
23906 SDValue CC = Op.getOperand(2);
23907 MVT VT = Op.getSimpleValueType();
23908 assert(VT.getVectorElementType() == MVT::i1 &&
23909 "Cannot set masked compare for this operation");
23910
23911 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23912
23913 // Prefer SETGT over SETLT.
23914 if (SetCCOpcode == ISD::SETLT) {
23915 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23916 std::swap(Op0, Op1);
23917 }
23918
23919 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23920}
23921
23922/// Given a buildvector constant, return a new vector constant with each element
23923/// incremented or decremented. If incrementing or decrementing would result in
23924/// unsigned overflow or underflow or this is not a simple vector constant,
23925/// return an empty value.
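/// For example, incrementing <4 x i32> <1, 2, 3, 4> yields <2, 3, 4, 5>, while
/// decrementing <0, 1, 2, 3> fails because element 0 would wrap.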
23926static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23927 bool NSW) {
23928 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23929 if (!BV || !V.getValueType().isSimple())
23930 return SDValue();
23931
23932 MVT VT = V.getSimpleValueType();
23933 MVT EltVT = VT.getVectorElementType();
23934 unsigned NumElts = VT.getVectorNumElements();
23935 SmallVector<SDValue, 8> NewVecC;
23936 SDLoc DL(V);
23937 for (unsigned i = 0; i < NumElts; ++i) {
23938 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23939 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23940 return SDValue();
23941
23942 // Avoid overflow/underflow.
23943 const APInt &EltC = Elt->getAPIntValue();
23944 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23945 return SDValue();
23946 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23947 (!IsInc && EltC.isMinSignedValue())))
23948 return SDValue();
23949
23950 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23951 }
23952
23953 return DAG.getBuildVector(VT, DL, NewVecC);
23954}
23955
23956/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23957/// Op0 u<= Op1:
23958/// t = psubus Op0, Op1
23959/// pcmpeq t, <0..0>
23960static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23961 ISD::CondCode Cond, const SDLoc &dl,
23962 const X86Subtarget &Subtarget,
23963 SelectionDAG &DAG) {
23964 if (!Subtarget.hasSSE2())
23965 return SDValue();
23966
23967 MVT VET = VT.getVectorElementType();
23968 if (VET != MVT::i8 && VET != MVT::i16)
23969 return SDValue();
23970
23971 switch (Cond) {
23972 default:
23973 return SDValue();
23974 case ISD::SETULT: {
23975 // If the comparison is against a constant we can turn this into a
23976 // setule. With psubus, setule does not require a swap. This is
23977 // beneficial because the constant in the register is no longer
23978 // clobbered as the destination, so it can be hoisted out of a loop.
23979 // Only do this pre-AVX since vpcmp* is no longer destructive.
23980 if (Subtarget.hasAVX())
23981 return SDValue();
23982 SDValue ULEOp1 =
23983 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23984 if (!ULEOp1)
23985 return SDValue();
23986 Op1 = ULEOp1;
23987 break;
23988 }
23989 case ISD::SETUGT: {
23990 // If the comparison is against a constant, we can turn this into a setuge.
23991 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23992 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23993 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23994 SDValue UGEOp1 =
23995 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23996 if (!UGEOp1)
23997 return SDValue();
23998 Op1 = Op0;
23999 Op0 = UGEOp1;
24000 break;
24001 }
24002 // Psubus is better than flip-sign because it requires no inversion.
24003 case ISD::SETUGE:
24004 std::swap(Op0, Op1);
24005 break;
24006 case ISD::SETULE:
24007 break;
24008 }
24009
24010 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24011 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24012 DAG.getConstant(0, dl, VT));
24013}
24014
24015static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24016 SelectionDAG &DAG) {
24017 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24018 Op.getOpcode() == ISD::STRICT_FSETCCS;
24019 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24020 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24021 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24022 MVT VT = Op->getSimpleValueType(0);
24023 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24024 MVT OpVT = Op0.getSimpleValueType();
24025 SDLoc dl(Op);
24026
24027 if (OpVT.isFloatingPoint()) {
24028 MVT EltVT = OpVT.getVectorElementType();
24029 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24030 EltVT == MVT::f64);
24031
24032 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24033 if (isSoftF16(EltVT, Subtarget)) {
24034 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24035 return SDValue();
24036
24037 // Break 256-bit FP vector compare into smaller ones.
24038 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24039 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24040
24041 // Break 512-bit FP vector compare into smaller ones.
24042 if (OpVT.is512BitVector())
24043 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24044
24045 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24046 if (IsStrict) {
24047 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24048 {Chain, Op0});
24049 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24050 {Chain, Op1});
24051 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24052 {Chain, Op0, Op1, CC});
24053 }
24054 MVT DVT = VT.getVectorElementType() == MVT::i16
24055 ? VT.changeVectorElementType(MVT::i32)
24056 : VT;
24057 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24058 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24059 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24060 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24061 }
24062
24063 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24064
24065 // If we have a strict compare with a vXi1 result and the input is 128/256
24066 // bits we can't use a masked compare unless we have VLX. If we use a wider
24067 // compare like we do for non-strict, we might trigger spurious exceptions
24068 // from the upper elements. Instead emit an AVX compare and convert to mask.
24069 unsigned Opc;
24070 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24071 (!IsStrict || Subtarget.hasVLX() ||
24072 Op0.getSimpleValueType().is512BitVector())) {
24073#ifndef NDEBUG
24074 unsigned Num = VT.getVectorNumElements();
24075 assert(Num <= 16 ||
24076 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24077#endif
24078 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24079 } else {
24080 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24081 // The SSE/AVX packed FP comparison nodes are defined with a
24082 // floating-point vector result that matches the operand type. This allows
24083 // them to work with an SSE1 target (integer vector types are not legal).
24084 VT = Op0.getSimpleValueType();
24085 }
24086
24087 SDValue Cmp;
24088 bool IsAlwaysSignaling;
24089 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24090 if (!Subtarget.hasAVX()) {
24091 // TODO: We could use following steps to handle a quiet compare with
24092 // signaling encodings.
24093 // 1. Get ordered masks from a quiet ISD::SETO
24094 // 2. Use the masks to mask potential unordered elements in operand A, B
24095 // 3. Get the compare results of masked A, B
24096 // 4. Calculating final result using the mask and result from 3
24097 // But currently, we just fall back to scalar operations.
24098 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24099 return SDValue();
24100
24101 // Insert an extra signaling instruction to raise exception.
24102 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24103 SDValue SignalCmp = DAG.getNode(
24104 Opc, dl, {VT, MVT::Other},
24105 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24106 // FIXME: It seems we need to update the flags of all new strict nodes.
24107 // Otherwise, mayRaiseFPException in MI will return false due to
24108 // NoFPExcept = false by default. However, I didn't find it in other
24109 // patches.
24110 SignalCmp->setFlags(Op->getFlags());
24111 Chain = SignalCmp.getValue(1);
24112 }
24113
24114 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24115 // emit two comparisons and a logic op to tie them together.
24116 if (!cheapX86FSETCC_SSE(Cond)) {
24117 // LLVM predicate is SETUEQ or SETONE.
24118 unsigned CC0, CC1;
24119 unsigned CombineOpc;
24120 if (Cond == ISD::SETUEQ) {
24121 CC0 = 3; // UNORD
24122 CC1 = 0; // EQ
24123 CombineOpc = X86ISD::FOR;
24124 } else {
24125 assert(Cond == ISD::SETONE);
24126 CC0 = 7; // ORD
24127 CC1 = 4; // NEQ
24128 CombineOpc = X86ISD::FAND;
24129 }
24130
24131 SDValue Cmp0, Cmp1;
24132 if (IsStrict) {
24133 Cmp0 = DAG.getNode(
24134 Opc, dl, {VT, MVT::Other},
24135 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24136 Cmp1 = DAG.getNode(
24137 Opc, dl, {VT, MVT::Other},
24138 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24139 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24140 Cmp1.getValue(1));
24141 } else {
24142 Cmp0 = DAG.getNode(
24143 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24144 Cmp1 = DAG.getNode(
24145 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24146 }
24147 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24148 } else {
24149 if (IsStrict) {
24150 Cmp = DAG.getNode(
24151 Opc, dl, {VT, MVT::Other},
24152 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24153 Chain = Cmp.getValue(1);
24154 } else
24155 Cmp = DAG.getNode(
24156 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24157 }
24158 } else {
24159 // Handle all other FP comparisons here.
24160 if (IsStrict) {
24161 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24162 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24163 Cmp = DAG.getNode(
24164 Opc, dl, {VT, MVT::Other},
24165 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24166 Chain = Cmp.getValue(1);
24167 } else
24168 Cmp = DAG.getNode(
24169 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24170 }
24171
24172 if (VT.getFixedSizeInBits() >
24173 Op.getSimpleValueType().getFixedSizeInBits()) {
24174 // We emitted a compare with an XMM/YMM result. Finish converting to a
24175 // mask register using a vptestm.
24176 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24177 Cmp = DAG.getBitcast(CastVT, Cmp);
24178 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24179 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24180 } else {
24181 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24182 // the result type of SETCC. The bitcast is expected to be optimized
24183 // away during combining/isel.
24184 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24185 }
24186
24187 if (IsStrict)
24188 return DAG.getMergeValues({Cmp, Chain}, dl);
24189
24190 return Cmp;
24191 }
24192
24193 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24194
24195 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24196 assert(VTOp0 == Op1.getSimpleValueType() &&
24197 "Expected operands with same type!");
24198 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24199 "Invalid number of packed elements for source and destination!");
24200
24201 // The non-AVX512 code below works under the assumption that source and
24202 // destination types are the same.
24203 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24204 "Value types for source and destination must be the same!");
24205
24206 // The result is boolean, but operands are int/float
24207 if (VT.getVectorElementType() == MVT::i1) {
24208 // In the AVX-512 architecture setcc returns a mask with i1 elements,
24209 // but there is no compare instruction for i8 and i16 elements in KNL.
24210 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24211 "Unexpected operand type");
24212 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24213 }
24214
24215 // Lower using XOP integer comparisons.
24216 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24217 // Translate compare code to XOP PCOM compare mode.
24218 unsigned CmpMode = 0;
24219 switch (Cond) {
24220 // clang-format off
24221 default: llvm_unreachable("Unexpected SETCC condition");
24222 case ISD::SETULT:
24223 case ISD::SETLT: CmpMode = 0x00; break;
24224 case ISD::SETULE:
24225 case ISD::SETLE: CmpMode = 0x01; break;
24226 case ISD::SETUGT:
24227 case ISD::SETGT: CmpMode = 0x02; break;
24228 case ISD::SETUGE:
24229 case ISD::SETGE: CmpMode = 0x03; break;
24230 case ISD::SETEQ: CmpMode = 0x04; break;
24231 case ISD::SETNE: CmpMode = 0x05; break;
24232 // clang-format on
24233 }
24234
24235 // Are we comparing unsigned or signed integers?
24236 unsigned Opc =
24237 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24238
24239 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24240 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24241 }
24242
24243 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24244 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24245 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24246 SDValue BC0 = peekThroughBitcasts(Op0);
24247 if (BC0.getOpcode() == ISD::AND &&
24248 isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24249 /*AllowUndefs=*/false)) {
24250 Cond = ISD::SETEQ;
24251 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24252 }
24253 }
24254
24255 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
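// Worked example (illustrative, not from the source): for v4i32 and C == 8,
//   (x & 8) == 8  ->  sra(shl(x, 28), 31)
// i.e. shift bit 3 into the sign position, then arithmetic-shift it back to
// fill the whole lane with that bit.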
24256 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24257 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24258 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24259 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24260 unsigned BitWidth = VT.getScalarSizeInBits();
24261 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24262
24263 SDValue Result = Op0.getOperand(0);
24264 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24265 DAG.getConstant(ShiftAmt, dl, VT));
24266 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24267 DAG.getConstant(BitWidth - 1, dl, VT));
24268 return Result;
24269 }
24270 }
24271
24272 // Break 256-bit integer vector compare into smaller ones.
24273 if (VT.is256BitVector() && !Subtarget.hasInt256())
24274 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24275
24276 // Break 512-bit integer vector compare into smaller ones.
24277 // TODO: Try harder to use VPCMPx + VPMOV2x?
24278 if (VT.is512BitVector())
24279 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24280
24281 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24282 // not-of-PCMPEQ:
24283 // X != INT_MIN --> X >s INT_MIN
24284 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24285 // +X != 0 --> +X >s 0
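// Concrete i8 example (illustrative): x != -128 can be emitted directly as
// pcmpgtb(x, -128) instead of pcmpeqb followed by an invert.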
24286 APInt ConstValue;
24287 if (Cond == ISD::SETNE &&
24288 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24289 if (ConstValue.isMinSignedValue())
24290 Cond = ISD::SETGT;
24291 else if (ConstValue.isMaxSignedValue())
24292 Cond = ISD::SETLT;
24293 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24294 Cond = ISD::SETGT;
24295 }
24296
24297 // If both operands are known non-negative, then an unsigned compare is the
24298 // same as a signed compare and there's no need to flip signbits.
24299 // TODO: We could check for more general simplifications here since we're
24300 // computing known bits.
24301 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24302 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24303
24304 // Special case: Use min/max operations for unsigned compares.
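// The identities relied on below (illustrative sketch):
//   a <=u b  <=>  a == umin(a, b)
//   a >=u b  <=>  a == umax(a, b)
// Strict compares (SETULT/SETUGT) are handled by inverting the PCMPEQ result.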
24305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24306 if (ISD::isUnsignedIntSetCC(Cond) &&
24307 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24308 TLI.isOperationLegal(ISD::UMIN, VT)) {
24309 // If we have a constant operand, increment/decrement it and change the
24310 // condition to avoid an invert.
24311 if (Cond == ISD::SETUGT) {
24312 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24313 if (SDValue UGTOp1 =
24314 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24315 Op1 = UGTOp1;
24316 Cond = ISD::SETUGE;
24317 }
24318 }
24319 if (Cond == ISD::SETULT) {
24320 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24321 if (SDValue ULTOp1 =
24322 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24323 Op1 = ULTOp1;
24324 Cond = ISD::SETULE;
24325 }
24326 }
24327 bool Invert = false;
24328 unsigned Opc;
24329 switch (Cond) {
24330 // clang-format off
24331 default: llvm_unreachable("Unexpected condition code");
24332 case ISD::SETUGT: Invert = true; [[fallthrough]];
24333 case ISD::SETULE: Opc = ISD::UMIN; break;
24334 case ISD::SETULT: Invert = true; [[fallthrough]];
24335 case ISD::SETUGE: Opc = ISD::UMAX; break;
24336 // clang-format on
24337 }
24338
24339 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24340 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24341
24342 // If the logical-not of the result is required, perform that now.
24343 if (Invert)
24344 Result = DAG.getNOT(dl, Result, VT);
24345
24346 return Result;
24347 }
24348
24349 // Try to use SUBUS and PCMPEQ.
24350 if (FlipSigns)
24351 if (SDValue V =
24352 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24353 return V;
24354
24355 // We are handling one of the integer comparisons here. Since SSE only has
24356 // GT and EQ comparisons for integer, swapping operands and multiple
24357 // operations may be required for some comparisons.
24358 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24359 : X86ISD::PCMPGT;
24360 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24361 Cond == ISD::SETGE || Cond == ISD::SETUGE;
24362 bool Invert = Cond == ISD::SETNE ||
24363 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24364
24365 if (Swap)
24366 std::swap(Op0, Op1);
24367
24368 // Check that the operation in question is available (most are plain SSE2,
24369 // but PCMPGTQ and PCMPEQQ have different requirements).
24370 if (VT == MVT::v2i64) {
24371 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24372 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24373
24374 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24375 // the odd elements over the even elements.
24376 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24377 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24378 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24379
24380 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24381 static const int MaskHi[] = { 1, 1, 3, 3 };
24382 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24383
24384 return DAG.getBitcast(VT, Result);
24385 }
24386
24387 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24388 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24389 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24390
24391 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24392 static const int MaskHi[] = { 1, 1, 3, 3 };
24393 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24394
24395 return DAG.getBitcast(VT, Result);
24396 }
24397
24398 // If the i64 elements are sign-extended enough to be representable as i32
24399 // then we can compare the lower i32 bits and splat.
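// e.g. if both v2i64 operands have at least 33 sign bits, a v4i32 PCMPGT on
// the low dwords gives the right answer; the shuffle below copies each low
// dword result over its high dword to rebuild the v2i64 mask (illustrative).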
24400 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24401 DAG.ComputeNumSignBits(Op1) > 32) {
24402 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24403 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24404
24405 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24406 static const int MaskLo[] = {0, 0, 2, 2};
24407 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24408
24409 return DAG.getBitcast(VT, Result);
24410 }
24411
24412 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24413 // bits of the inputs before performing those operations. The lower
24414 // compare is always unsigned.
24415 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24416 : 0x0000000080000000ULL,
24417 dl, MVT::v2i64);
24418
24419 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24420 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24421
24422 // Cast everything to the right type.
24423 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24424 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24425
24426 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
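// Scalar sketch of the emulation (illustrative, not from the source):
//   gt64(a, b) = (hi(a) > hi(b)) || ((hi(a) == hi(b)) && (lo(a) >u lo(b)))
// The XORs above make the low-dword compare unsigned; when FlipSigns is set
// the high dwords were flipped as well, so the signed PCMPGT acts as an
// unsigned compare there too.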
24427 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24428 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24429
24430 // Create masks for only the low parts/high parts of the 64 bit integers.
24431 static const int MaskHi[] = { 1, 1, 3, 3 };
24432 static const int MaskLo[] = { 0, 0, 2, 2 };
24433 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24434 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24435 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24436
24437 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24438 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24439
24440 if (Invert)
24441 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24442
24443 return DAG.getBitcast(VT, Result);
24444 }
24445
24446 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24447 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24448 // pcmpeqd + pshufd + pand.
24449 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24450
24451 // First cast everything to the right type.
24452 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24453 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24454
24455 // Do the compare.
24456 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24457
24458 // Make sure the lower and upper halves are both all-ones.
24459 static const int Mask[] = { 1, 0, 3, 2 };
24460 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24461 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24462
24463 if (Invert)
24464 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24465
24466 return DAG.getBitcast(VT, Result);
24467 }
24468 }
24469
24470 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24471 // bits of the inputs before performing those operations.
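// Illustrative identity: a <u b  <=>  (a ^ SignMask) <s (b ^ SignMask), so
// XOR-ing the sign bit into both operands lets the signed PCMPGT stand in for
// the missing unsigned compare.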
24472 if (FlipSigns) {
24473 MVT EltVT = VT.getVectorElementType();
24474 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24475 VT);
24476 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24477 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24478 }
24479
24480 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24481
24482 // If the logical-not of the result is required, perform that now.
24483 if (Invert)
24484 Result = DAG.getNOT(dl, Result, VT);
24485
24486 return Result;
24487}
24488
24489// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24490 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24491 const SDLoc &dl, SelectionDAG &DAG,
24492 const X86Subtarget &Subtarget,
24493 SDValue &X86CC) {
24494 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24495
24496 // Must be a bitcast from vXi1.
24497 if (Op0.getOpcode() != ISD::BITCAST)
24498 return SDValue();
24499
24500 Op0 = Op0.getOperand(0);
24501 MVT VT = Op0.getSimpleValueType();
24502 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24503 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24504 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24505 return SDValue();
24506
24507 X86::CondCode X86Cond;
24508 if (isNullConstant(Op1)) {
24509 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24510 } else if (isAllOnesConstant(Op1)) {
24511 // C flag is set for all ones.
24512 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24513 } else
24514 return SDValue();
24515
24516 // If the input is an AND, we can combine its operands into the KTEST.
24517 bool KTestable = false;
24518 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24519 KTestable = true;
24520 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24521 KTestable = true;
24522 if (!isNullConstant(Op1))
24523 KTestable = false;
24524 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24525 SDValue LHS = Op0.getOperand(0);
24526 SDValue RHS = Op0.getOperand(1);
24527 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24528 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24529 }
24530
24531 // If the input is an OR, we can combine its operands into the KORTEST.
24532 SDValue LHS = Op0;
24533 SDValue RHS = Op0;
24534 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24535 LHS = Op0.getOperand(0);
24536 RHS = Op0.getOperand(1);
24537 }
24538
24539 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24540 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24541}
24542
24543/// Emit flags for the given setcc condition and operands. Also returns the
24544/// corresponding X86 condition code constant in X86CC.
24545SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24546 ISD::CondCode CC, const SDLoc &dl,
24547 SelectionDAG &DAG,
24548 SDValue &X86CC) const {
24549 // Equality Combines.
24550 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24551 X86::CondCode X86CondCode;
24552
24553 // Optimize to BT if possible.
24554 // Lower (X & (1 << N)) == 0 to BT(X, N).
24555 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24556 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
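// e.g. (x & (1 << 5)) == 0 can become BT(x, 5) read through CF (illustrative;
// the exact condition code is chosen by LowerAndToBT below).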
24557 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24558 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24559 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24560 return BT;
24561 }
24562 }
24563
24564 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24565 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24566 X86CondCode)) {
24567 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24568 return CmpZ;
24569 }
24570
24571 // Try to lower using KORTEST or KTEST.
24572 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24573 return Test;
24574
24575 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24576 // of these.
24577 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24578 // If the input is a setcc, then reuse the input setcc or use a new one
24579 // with the inverted condition.
24580 if (Op0.getOpcode() == X86ISD::SETCC) {
24581 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24582
24583 X86CC = Op0.getOperand(0);
24584 if (Invert) {
24585 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24586 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24587 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24588 }
24589
24590 return Op0.getOperand(1);
24591 }
24592 }
24593
24594 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24595 // overflow.
24596 if (isMinSignedConstant(Op1)) {
24597 EVT VT = Op0.getValueType();
24598 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24599 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24600 X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24601 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24602 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24603 DAG.getConstant(0, dl, VT), Op0);
24604 return SDValue(Neg.getNode(), 1);
24605 }
24606 }
24607
24608 // Try to use the carry flag from the add in place of a separate CMP for:
24609 // (seteq (add X, -1), -1). Similar for setne.
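// Illustrative reasoning: X + (-1) sets CF iff X != 0, and (X + -1) == -1 iff
// X == 0, so SETEQ maps to COND_AE (CF clear) and SETNE maps to COND_B (CF
// set), as selected below.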
24610 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24611 Op0.getOperand(1) == Op1) {
24612 if (isProfitableToUseFlagOp(Op0)) {
24613 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24614
24615 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24616 Op0.getOperand(1));
24617 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24618 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24619 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24620 return SDValue(New.getNode(), 1);
24621 }
24622 }
24623 }
24624
24625 X86::CondCode CondCode =
24626 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24627 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24628
24629 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24630 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24631 return EFLAGS;
24632}
24633
24634SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24635
24636 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24637 Op.getOpcode() == ISD::STRICT_FSETCCS;
24638 MVT VT = Op->getSimpleValueType(0);
24639
24640 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24641
24642 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24643 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24644 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24645 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24646 SDLoc dl(Op);
24647 ISD::CondCode CC =
24648 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24649
24650 if (isSoftF16(Op0.getValueType(), Subtarget))
24651 return SDValue();
24652
24653 // Handle f128 first, since one possible outcome is a normal integer
24654 // comparison which gets handled by emitFlagsForSetcc.
24655 if (Op0.getValueType() == MVT::f128) {
24656 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24657 Op.getOpcode() == ISD::STRICT_FSETCCS);
24658
24659 // If softenSetCCOperands returned a scalar, use it.
24660 if (!Op1.getNode()) {
24661 assert(Op0.getValueType() == Op.getValueType() &&
24662 "Unexpected setcc expansion!");
24663 if (IsStrict)
24664 return DAG.getMergeValues({Op0, Chain}, dl);
24665 return Op0;
24666 }
24667 }
24668
24669 if (Op0.getSimpleValueType().isInteger()) {
24670 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24671 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
24672 // this may translate to less uops depending on uarch implementation. The
24673 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24674 // canonicalize to that CondCode.
24675 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24676 // encoding size - so it must either already be an i8 or i32 immediate, or it
24677 // shrinks down to that. We don't do this for any i64's to avoid additional
24678 // constant materializations.
24679 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
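// e.g. (setgt x, 6) becomes (setge x, 7) (illustrative): both immediates still
// fit in an i8 encoding and the GE condition avoids reading ZF.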
24680 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24681 const APInt &Op1Val = Op1C->getAPIntValue();
24682 if (!Op1Val.isZero()) {
24683 // Ensure the constant+1 doesn't overflow.
24684 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24685 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24686 APInt Op1ValPlusOne = Op1Val + 1;
24687 if (Op1ValPlusOne.isSignedIntN(32) &&
24688 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24689 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24690 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24691 : ISD::CondCode::SETUGE;
24692 }
24693 }
24694 }
24695 }
24696
24697 SDValue X86CC;
24698 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24699 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24700 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24701 }
24702
24703 if (Subtarget.hasAVX10_2()) {
24704 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24705 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24706 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24707 if (Op0.getSimpleValueType() != MVT::f80) {
24708 SDValue Res = getSETCC(
24709 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24710 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24711 }
24712 }
24713 }
24714 // Handle floating point.
24715 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24716 if (CondCode == X86::COND_INVALID)
24717 return SDValue();
24718
24719 SDValue EFLAGS;
24720 if (IsStrict) {
24721 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24722 EFLAGS =
24723 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24724 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24725 Chain = EFLAGS.getValue(1);
24726 } else {
24727 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24728 }
24729
24730 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24731 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24732 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24733}
24734
24735SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24736 SDValue LHS = Op.getOperand(0);
24737 SDValue RHS = Op.getOperand(1);
24738 SDValue Carry = Op.getOperand(2);
24739 SDValue Cond = Op.getOperand(3);
24740 SDLoc DL(Op);
24741
24742 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24743 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24744
24745 // Recreate the carry if needed.
24746 EVT CarryVT = Carry.getValueType();
24747 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24748 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24749
24750 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24751 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24752 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24753}
24754
24755// This function returns three things: the arithmetic computation itself
24756// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24757// flag and the condition code define the case in which the arithmetic
24758// computation overflows.
24759static std::pair<SDValue, SDValue>
24760 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24761 assert(Op.getResNo() == 0 && "Unexpected result number!");
24762 SDValue Value, Overflow;
24763 SDValue LHS = Op.getOperand(0);
24764 SDValue RHS = Op.getOperand(1);
24765 unsigned BaseOp = 0;
24766 SDLoc DL(Op);
24767 switch (Op.getOpcode()) {
24768 default: llvm_unreachable("Unknown ovf instruction!");
24769 case ISD::SADDO:
24770 BaseOp = X86ISD::ADD;
24771 Cond = X86::COND_O;
24772 break;
24773 case ISD::UADDO:
24774 BaseOp = X86ISD::ADD;
24775 Cond = X86::COND_B;
24776 break;
24777 case ISD::SSUBO:
24778 BaseOp = X86ISD::SUB;
24779 Cond = X86::COND_O;
24780 break;
24781 case ISD::USUBO:
24782 BaseOp = X86ISD::SUB;
24783 Cond = X86::COND_B;
24784 break;
24785 case ISD::SMULO:
24786 BaseOp = X86ISD::SMUL;
24787 Cond = X86::COND_O;
24788 break;
24789 case ISD::UMULO:
24790 BaseOp = X86ISD::UMUL;
24791 Cond = X86::COND_O;
24792 break;
24793 }
24794
24795 if (BaseOp) {
24796 // Also sets EFLAGS.
24797 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24798 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24799 Overflow = Value.getValue(1);
24800 }
24801
24802 return std::make_pair(Value, Overflow);
24803}
24804
24805 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24806 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24807 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24808 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24809 // has only one use.
24810 SDLoc DL(Op);
24811 X86::CondCode Cond;
24812 SDValue Value, Overflow;
24813 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24814
24815 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24816 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24817 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24818}
24819
24820/// Return true if opcode is a X86 logical comparison.
24821 static bool isX86LogicalCmp(SDValue Op) {
24822 unsigned Opc = Op.getOpcode();
24823 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24824 Opc == X86ISD::FCMP)
24825 return true;
24826 if (Op.getResNo() == 1 &&
24827 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24828 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24829 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24830 return true;
24831
24832 return false;
24833}
24834
24835 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24836 if (V.getOpcode() != ISD::TRUNCATE)
24837 return false;
24838
24839 SDValue VOp0 = V.getOperand(0);
24840 unsigned InBits = VOp0.getValueSizeInBits();
24841 unsigned Bits = V.getValueSizeInBits();
24842 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24843}
24844
24845// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24846 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24847 unsigned X86CC, const SDLoc &DL,
24848 SelectionDAG &DAG,
24849 const X86Subtarget &Subtarget) {
24850 EVT CmpVT = CmpVal.getValueType();
24851 EVT VT = LHS.getValueType();
24852 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24853 return SDValue();
24854
24855 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24856 isOneConstant(CmpVal.getOperand(1))) {
24857 auto SplatLSB = [&](EVT SplatVT) {
24858 // We need a mask of all zeros or all ones with the same size as the other
24859 // operands.
24860 SDValue Neg = CmpVal;
24861 if (CmpVT.bitsGT(SplatVT))
24862 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24863 else if (CmpVT.bitsLT(SplatVT))
24864 Neg = DAG.getNode(
24865 ISD::AND, DL, SplatVT,
24866 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24867 DAG.getConstant(1, DL, SplatVT));
24868 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24869 };
24870
24871 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24872 if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24873 return SplatLSB(VT);
24874
24875 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24876 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24877 isa<ConstantSDNode>(RHS)) {
24878 SDValue Mask = SplatLSB(VT);
24879 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24880 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24881 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24882 }
24883
24884 SDValue Src1, Src2;
24885 auto isIdentityPatternZero = [&]() {
24886 switch (RHS.getOpcode()) {
24887 default:
24888 break;
24889 case ISD::OR:
24890 case ISD::XOR:
24891 case ISD::ADD:
24892 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24893 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24894 Src2 = LHS;
24895 return true;
24896 }
24897 break;
24898 case ISD::SHL:
24899 case ISD::SRA:
24900 case ISD::SRL:
24901 case ISD::SUB:
24902 if (RHS.getOperand(0) == LHS) {
24903 Src1 = RHS.getOperand(1);
24904 Src2 = LHS;
24905 return true;
24906 }
24907 break;
24908 }
24909 return false;
24910 };
24911
24912 auto isIdentityPatternOnes = [&]() {
24913 switch (LHS.getOpcode()) {
24914 default:
24915 break;
24916 case ISD::AND:
24917 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24918 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24919 Src2 = RHS;
24920 return true;
24921 }
24922 break;
24923 }
24924 return false;
24925 };
24926
24927 // Convert 'identity' patterns (iff X is 0 or 1):
24928 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24929 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24930 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24931 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24932 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24933 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24934 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24935 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24936 SDValue Mask = SplatLSB(Src1.getValueType());
24937 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24938 Src1); // Mask & z
24939 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24940 }
24941 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24942 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24943 SDValue Mask = SplatLSB(VT);
24944 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24945 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24946 }
24947 }
24948
24949 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24950 (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24951 SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24952 SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24953
24954 // 'X - 1' sets the carry flag if X == 0.
24955 // '0 - X' sets the carry flag if X != 0.
24956 // Convert the carry flag to a -1/0 mask with sbb:
24957 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24958 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24959 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24960 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
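// Worked case (illustrative): for (select (x != 0), -1, y), emit 0 - x; the
// SETCC_CARRY below materializes -1 when CF is set (x != 0) and 0 otherwise,
// and the final OR with Y yields -1 or y respectively.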
24961 SDValue Sub;
24962 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24963 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24964 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24965 } else {
24966 SDValue One = DAG.getConstant(1, DL, CmpVT);
24967 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
24968 }
24969 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24970 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24971 Sub.getValue(1));
24972 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24973 }
24974
24975 return SDValue();
24976}
24977
24978SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24979 bool AddTest = true;
24980 SDValue Cond = Op.getOperand(0);
24981 SDValue Op1 = Op.getOperand(1);
24982 SDValue Op2 = Op.getOperand(2);
24983 SDLoc DL(Op);
24984 MVT VT = Op1.getSimpleValueType();
24985 SDValue CC;
24986
24987 if (isSoftF16(VT, Subtarget)) {
24988 MVT NVT = VT.changeTypeToInteger();
24989 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24990 DAG.getBitcast(NVT, Op1),
24991 DAG.getBitcast(NVT, Op2)));
24992 }
24993
24994 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24995 // are available or VBLENDV if AVX is available.
24996 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24997 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24998 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24999 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25000 bool IsAlwaysSignaling;
25001 unsigned SSECC =
25002 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25003 CondOp0, CondOp1, IsAlwaysSignaling);
25004
25005 if (Subtarget.hasAVX512()) {
25006 SDValue Cmp =
25007 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25008 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25009 assert(!VT.isVector() && "Not a scalar type?");
25010 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25011 }
25012
25013 if (SSECC < 8 || Subtarget.hasAVX()) {
25014 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25015 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25016
25017 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25018 // instead of 3 logic instructions for size savings and potentially speed.
25019 // Unfortunately, there is no scalar form of VBLENDV.
25020 //
25021 // If either operand is a +0.0 constant, don't try this. We can expect to
25022 // optimize away at least one of the logic instructions later in that
25023 // case, so that sequence would be faster than a variable blend.
25024 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25025 !isNullFPConstant(Op2)) {
25026 // Convert to vectors, do a VSELECT, and convert back to scalar.
25027 // All of the conversions should be optimized away.
25028 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25029 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25030 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25031 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25032
25033 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25034 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25035
25036 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25037
25038 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25039 DAG.getVectorIdxConstant(0, DL));
25040 }
25041 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25042 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25043 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25044 }
25045 }
25046
25047 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25048 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25049 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25050 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25051 }
25052
25053 if (Cond.getOpcode() == ISD::SETCC &&
25054 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25055 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25056 Cond = NewCond;
25057 // If the condition was updated, it's possible that the operands of the
25058 // select were also updated (for example, EmitTest has a RAUW). Refresh
25059 // the local references to the select operands in case they got stale.
25060 Op1 = Op.getOperand(1);
25061 Op2 = Op.getOperand(2);
25062 }
25063 }
25064
25065 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25066 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25067 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25068 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25069 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25070 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25071 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25072 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
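// e.g. for i32 smax(x, 0) (illustrative): (~(x >> 31)) & x keeps x when the
// sign bit is clear and produces 0 when it is set.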
25073 if (Cond.getOpcode() == X86ISD::SETCC &&
25074 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25075 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25076 SDValue Cmp = Cond.getOperand(1);
25077 SDValue CmpOp0 = Cmp.getOperand(0);
25078 unsigned CondCode = Cond.getConstantOperandVal(0);
25079
25080 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25081 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25082 // handling to keep the CMP with 0. This should be removed by
25083 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25084 // cttz_zero_undef.
25085 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25086 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25087 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25088 };
25089 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25090 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25091 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25092 // Keep Cmp.
25093 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25094 DL, DAG, Subtarget)) {
25095 return R;
25096 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25097 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25098 ((CondCode == X86::COND_S) || // smin(x, 0)
25099 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25100 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25101 //
25102 // If the comparison is testing for a positive value, we have to invert
25103 // the sign bit mask, so only do that transform if the target has a
25104 // bitwise 'and not' instruction (the invert is free).
25105 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25106 unsigned ShCt = VT.getSizeInBits() - 1;
25107 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25108 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25109 if (CondCode == X86::COND_G)
25110 Shift = DAG.getNOT(DL, Shift, VT);
25111 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25112 }
25113 }
25114
25115 // Look past (and (setcc_carry (cmp ...)), 1).
25116 if (Cond.getOpcode() == ISD::AND &&
25117 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25118 isOneConstant(Cond.getOperand(1)))
25119 Cond = Cond.getOperand(0);
25120
25121 // Attempt to fold "raw cond" cases by treating them as:
25122 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25123 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25124 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25125 Subtarget))
25126 return R;
25127
25128 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25129 // setting operand in place of the X86ISD::SETCC.
25130 unsigned CondOpcode = Cond.getOpcode();
25131 if (CondOpcode == X86ISD::SETCC ||
25132 CondOpcode == X86ISD::SETCC_CARRY) {
25133 CC = Cond.getOperand(0);
25134
25135 SDValue Cmp = Cond.getOperand(1);
25136 bool IllegalFPCMov = false;
25137 if (VT.isFloatingPoint() && !VT.isVector() &&
25138 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25139 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25140
25141 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25142 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25143 Cond = Cmp;
25144 AddTest = false;
25145 }
25146 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25147 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25148 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25149 SDValue Value;
25150 X86::CondCode X86Cond;
25151 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25152
25153 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25154 AddTest = false;
25155 }
25156
25157 if (AddTest) {
25158 // Look past the truncate if the high bits are known zero.
25159 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25160 Cond = Cond.getOperand(0);
25161
25162 // We know the result of AND is compared against zero. Try to match
25163 // it to BT.
25164 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25165 X86::CondCode X86CondCode;
25166 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25167 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25168 Cond = BT;
25169 AddTest = false;
25170 }
25171 }
25172 }
25173
25174 if (AddTest) {
25175 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25176 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25177 }
25178
25179 // a < b ? -1 : 0 -> RES = ~setcc_carry
25180 // a < b ? 0 : -1 -> RES = setcc_carry
25181 // a >= b ? -1 : 0 -> RES = setcc_carry
25182 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25183 if (Cond.getOpcode() == X86ISD::SUB) {
25184 unsigned CondCode = CC->getAsZExtVal();
25185
25186 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25187 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25188 (isNullConstant(Op1) || isNullConstant(Op2))) {
25189 SDValue Res =
25190 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25191 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25192 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25193 return DAG.getNOT(DL, Res, Res.getValueType());
25194 return Res;
25195 }
25196 }
25197
25198 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25199 // widen the cmov and push the truncate through. This avoids introducing a new
25200 // branch during isel and doesn't add any extensions.
25201 if (Op.getValueType() == MVT::i8 &&
25202 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25203 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25204 if (T1.getValueType() == T2.getValueType() &&
25205 // Exclude CopyFromReg to avoid partial register stalls.
25206 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25207 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25208 CC, Cond);
25209 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25210 }
25211 }
25212
25213 // Or finally, promote i8 cmovs if we have CMOV,
25214 // or i16 cmovs if it won't prevent folding a load.
25215 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25216 // legal, but EmitLoweredSelect() can not deal with these extensions
25217 // being inserted between two CMOV's. (in i16 case too TBN)
25218 // https://bugs.llvm.org/show_bug.cgi?id=40974
25219 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25220 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25221 !X86::mayFoldLoad(Op2, Subtarget))) {
25222 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25223 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25224 SDValue Ops[] = { Op2, Op1, CC, Cond };
25225 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25226 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25227 }
25228
25229 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25230 // condition is true.
25231 SDValue Ops[] = { Op2, Op1, CC, Cond };
25232 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25233}
25234
25235 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25236 const X86Subtarget &Subtarget,
25237 SelectionDAG &DAG) {
25238 MVT VT = Op->getSimpleValueType(0);
25239 SDValue In = Op->getOperand(0);
25240 MVT InVT = In.getSimpleValueType();
25241 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25242 MVT VTElt = VT.getVectorElementType();
25243 unsigned NumElts = VT.getVectorNumElements();
25244
25245 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25246 MVT ExtVT = VT;
25247 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25248 // If v16i32 is to be avoided, we'll need to split and concatenate.
25249 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25250 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25251
25252 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25253 }
25254
25255 // Widen to 512-bits if VLX is not supported.
25256 MVT WideVT = ExtVT;
25257 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25258 NumElts *= 512 / ExtVT.getSizeInBits();
25259 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25260 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25261 DAG.getVectorIdxConstant(0, dl));
25262 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25263 }
25264
25265 SDValue V;
25266 MVT WideEltVT = WideVT.getVectorElementType();
25267 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25268 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25269 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25270 } else {
25271 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25272 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25273 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25274 }
25275
25276 // Truncate if we had to extend i16/i8 above.
25277 if (VT != ExtVT) {
25278 WideVT = MVT::getVectorVT(VTElt, NumElts);
25279 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25280 }
25281
25282 // Extract back to 128/256-bit if we widened.
25283 if (WideVT != VT)
25284 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25285 DAG.getVectorIdxConstant(0, dl));
25286
25287 return V;
25288}
25289
25290 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25291 SelectionDAG &DAG) {
25292 SDValue In = Op->getOperand(0);
25293 MVT InVT = In.getSimpleValueType();
25294 SDLoc DL(Op);
25295
25296 if (InVT.getVectorElementType() == MVT::i1)
25297 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25298
25299 assert(Subtarget.hasAVX() && "Expected AVX support");
25300 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25301}
25302
25303// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25304// For sign extend this needs to handle all vector sizes and SSE4.1 and
25305// non-SSE4.1 targets. For zero extend this should only handle inputs of
25306// MVT::v64i8 when BWI is not supported, but AVX512 is.
25307 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25308 const X86Subtarget &Subtarget,
25309 SelectionDAG &DAG) {
25310 SDValue In = Op->getOperand(0);
25311 MVT VT = Op->getSimpleValueType(0);
25312 MVT InVT = In.getSimpleValueType();
25313
25314 MVT SVT = VT.getVectorElementType();
25315 MVT InSVT = InVT.getVectorElementType();
25317
25318 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25319 return SDValue();
25320 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25321 return SDValue();
25322 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25323 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25324 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25325 return SDValue();
25326
25327 SDLoc dl(Op);
25328 unsigned Opc = Op.getOpcode();
25329 unsigned NumElts = VT.getVectorNumElements();
25330
25331 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25332 // For 512-bit vectors, we need 128-bits or 256-bits.
25333 if (InVT.getSizeInBits() > 128) {
25334 // Input needs to be at least the same number of elements as output, and
25335 // at least 128-bits.
25336 int InSize = InSVT.getSizeInBits() * NumElts;
25337 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25338 InVT = In.getSimpleValueType();
25339 }
25340
25341 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25342 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25343 // need to be handled here for 256/512-bit results.
25344 if (Subtarget.hasInt256()) {
25345 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25346
25347 if (InVT.getVectorNumElements() != NumElts)
25348 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25349
25350 // FIXME: Apparently we create inreg operations that could be regular
25351 // extends.
25352 unsigned ExtOpc =
25353 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25354 : ISD::ZERO_EXTEND;
25355 return DAG.getNode(ExtOpc, dl, VT, In);
25356 }
25357
25358 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25359 if (Subtarget.hasAVX()) {
25360 assert(VT.is256BitVector() && "256-bit vector expected");
25361 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25362 int HalfNumElts = HalfVT.getVectorNumElements();
25363
25364 unsigned NumSrcElts = InVT.getVectorNumElements();
25365 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25366 for (int i = 0; i != HalfNumElts; ++i)
25367 HiMask[i] = HalfNumElts + i;
25368
25369 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25370 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25371 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25372 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25373 }
25374
25375 // We should only get here for sign extend.
25376 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25377 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25378 unsigned InNumElts = InVT.getVectorNumElements();
25379
25380 // If the source elements are already all-signbits, we don't need to extend,
25381 // just splat the elements.
25382 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25383 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25384 unsigned Scale = InNumElts / NumElts;
25385 SmallVector<int, 16> ShuffleMask;
25386 for (unsigned I = 0; I != NumElts; ++I)
25387 ShuffleMask.append(Scale, I);
25388 return DAG.getBitcast(VT,
25389 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25390 }
25391
25392 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25393 SDValue Curr = In;
25394 SDValue SignExt = Curr;
25395
25396 // As SRAI is only available on i16/i32 types, we expand only up to i32
25397 // and handle i64 separately.
25398 if (InVT != MVT::v4i32) {
25399 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25400
25401 unsigned DestWidth = DestVT.getScalarSizeInBits();
25402 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25403 unsigned DestElts = DestVT.getVectorNumElements();
25404
25405 // Build a shuffle mask that takes each input element and places it in the
25406 // MSBs of the new element size.
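// e.g. v16i8 -> v4i32 (illustrative): Scale == 4, so input element i lands in
// the top byte of dword i, and the VSRAI below shifts right by 24 to
// sign-fill the lower bytes.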
25407 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25408 for (unsigned i = 0; i != DestElts; ++i)
25409 Mask[i * Scale + (Scale - 1)] = i;
25410
25411 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25412 Curr = DAG.getBitcast(DestVT, Curr);
25413
25414 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25415 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25416 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25417 }
25418
25419 if (VT == MVT::v2i64) {
25420 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25421 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25422 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25423 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25424 SignExt = DAG.getBitcast(VT, SignExt);
25425 }
25426
25427 return SignExt;
25428}
25429
25431 SelectionDAG &DAG) {
25432 MVT VT = Op->getSimpleValueType(0);
25433 SDValue In = Op->getOperand(0);
25434 MVT InVT = In.getSimpleValueType();
25435 SDLoc dl(Op);
25436
25437 if (InVT.getVectorElementType() == MVT::i1)
25438 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25439
25440 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25441 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25442 "Expected same number of elements");
25443 assert((VT.getVectorElementType() == MVT::i16 ||
25444 VT.getVectorElementType() == MVT::i32 ||
25445 VT.getVectorElementType() == MVT::i64) &&
25446 "Unexpected element type");
25447 assert((InVT.getVectorElementType() == MVT::i8 ||
25448 InVT.getVectorElementType() == MVT::i16 ||
25449 InVT.getVectorElementType() == MVT::i32) &&
25450 "Unexpected element type");
25451
25452 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25453 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25454 return splitVectorIntUnary(Op, DAG, dl);
25455 }
25456
25457 if (Subtarget.hasInt256())
25458 return Op;
25459
25460 // Optimize vectors in AVX mode
25461 // Sign extend v8i16 to v8i32 and
25462 // v4i32 to v4i64
25463 //
25464 // Divide input vector into two parts
25465 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25466 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25467 // concat the vectors to original VT
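// e.g. v8i16 -> v8i32 without AVX2 (illustrative): sign-extend the low half
// to v4i32, shuffle the high half down, sign-extend it as well, then
// concatenate the two v4i32 results.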
25468 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25469 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25470
25471 unsigned NumElems = InVT.getVectorNumElements();
25472 SmallVector<int,8> ShufMask(NumElems, -1);
25473 for (unsigned i = 0; i != NumElems/2; ++i)
25474 ShufMask[i] = i + NumElems/2;
25475
25476 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25477 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25478
25479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25480}
25481
25482/// Change a vector store into a pair of half-size vector stores.
25483 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25484 SDValue StoredVal = Store->getValue();
25485 assert((StoredVal.getValueType().is256BitVector() ||
25486 StoredVal.getValueType().is512BitVector()) &&
25487 "Expecting 256/512-bit op");
25488
25489 // Splitting volatile memory ops is not allowed unless the operation was not
25490 // legal to begin with. Assume the input store is legal (this transform is
25491 // only used for targets with AVX). Note: It is possible that we have an
25492 // illegal type like v2i128, and so we could allow splitting a volatile store
25493 // in that case if that is important.
25494 if (!Store->isSimple())
25495 return SDValue();
25496
25497 SDLoc DL(Store);
25498 SDValue Value0, Value1;
25499 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25500 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25501 SDValue Ptr0 = Store->getBasePtr();
25502 SDValue Ptr1 =
25503 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25504 SDValue Ch0 =
25505 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25506 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25507 SDValue Ch1 =
25508 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25509 Store->getPointerInfo().getWithOffset(HalfOffset),
25510 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25511 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25512}
25513
25514/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25515/// type.
25516 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25517 SelectionDAG &DAG) {
25518 SDValue StoredVal = Store->getValue();
25519 assert(StoreVT.is128BitVector() &&
25520 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25521 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25522
25523 // Splitting volatile memory ops is not allowed unless the operation was not
25524 // legal to begin with. We are assuming the input op is legal (this transform
25525 // is only used for targets with AVX).
25526 if (!Store->isSimple())
25527 return SDValue();
25528
25529 MVT StoreSVT = StoreVT.getScalarType();
25530 unsigned NumElems = StoreVT.getVectorNumElements();
25531 unsigned ScalarSize = StoreSVT.getStoreSize();
25532
25533 SDLoc DL(Store);
25534 SmallVector<SDValue, 4> Stores;
25535 for (unsigned i = 0; i != NumElems; ++i) {
25536 unsigned Offset = i * ScalarSize;
25537 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25538 TypeSize::getFixed(Offset), DL);
25539 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25540 DAG.getVectorIdxConstant(i, DL));
25541 SDValue Ch =
25542 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25543 Store->getPointerInfo().getWithOffset(Offset),
25544 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25545 Stores.push_back(Ch);
25546 }
25547 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25548}
25549
25550static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25551 SelectionDAG &DAG) {
25552 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25553 SDLoc dl(St);
25554 SDValue StoredVal = St->getValue();
25555
25556 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25557 if (StoredVal.getValueType().isVector() &&
25558 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25559 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25560 assert(NumElts <= 8 && "Unexpected VT");
25561 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25562 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25563 "Expected AVX512F without AVX512DQI");
25564
25565 // We must pad with zeros to ensure we store zeroes to any unused bits.
25566 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25567 DAG.getUNDEF(MVT::v16i1), StoredVal,
25568 DAG.getVectorIdxConstant(0, dl));
25569 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25570 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25571 // Make sure we store zeros in the extra bits.
25572 if (NumElts < 8)
25573 StoredVal = DAG.getZeroExtendInReg(
25574 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25575
25576 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25577 St->getPointerInfo(), St->getBaseAlign(),
25578 St->getMemOperand()->getFlags());
25579 }
25580
25581 if (St->isTruncatingStore())
25582 return SDValue();
25583
25584 // If this is a 256/512-bit store of concatenated ops, we are better off
25585 // splitting that store into two half-size stores. This avoids spurious use of
25586 // concatenated ops and each half can execute independently. Some cores would
25587 // split the op into halves anyway, so the concat is purely an extra op.
25588 MVT StoreVT = StoredVal.getSimpleValueType();
25589 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25590 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25591 return splitVectorStore(St, DAG);
25592 return SDValue();
25593 }
25594
25595 if (StoreVT.is32BitVector())
25596 return SDValue();
25597
25598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25599 assert(StoreVT.is64BitVector() && "Unexpected VT");
25600 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25601 TargetLowering::TypeWidenVector &&
25602 "Unexpected type action!");
25603
25604 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25605 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25606 DAG.getUNDEF(StoreVT));
25607
25608 if (Subtarget.hasSSE2()) {
25609 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25610 // and store it.
25611 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25612 MVT CastVT = MVT::getVectorVT(StVT, 2);
25613 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25614 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25615 DAG.getVectorIdxConstant(0, dl));
25616
25617 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25618 St->getPointerInfo(), St->getBaseAlign(),
25619 St->getMemOperand()->getFlags());
25620 }
25621 assert(Subtarget.hasSSE1() && "Expected SSE");
25622 SDVTList Tys = DAG.getVTList(MVT::Other);
25623 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25624 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25625 St->getMemOperand());
25626}
25627
25628// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25629// may emit an illegal shuffle but the expansion is still better than scalar
25630// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
25631 // we'll emit a shuffle and an arithmetic shift.
25632// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25633// TODO: It is possible to support ZExt by zeroing the undef values during
25634// the shuffle phase or after the shuffle.
25635static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25636 SelectionDAG &DAG) {
25637 MVT RegVT = Op.getSimpleValueType();
25638 assert(RegVT.isVector() && "We only custom lower vector loads.");
25639 assert(RegVT.isInteger() &&
25640 "We only custom lower integer vector loads.");
25641
25642 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25643 SDLoc dl(Ld);
25644
25645 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25646 if (RegVT.getVectorElementType() == MVT::i1) {
25647 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25648 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25649 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25650 "Expected AVX512F without AVX512DQI");
25651
25652 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25653 Ld->getPointerInfo(), Ld->getBaseAlign(),
25654 Ld->getMemOperand()->getFlags());
25655
25656 // Replace chain users with the new chain.
25657 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25658
25659 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25660 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25661 DAG.getBitcast(MVT::v16i1, Val),
25662 DAG.getVectorIdxConstant(0, dl));
25663 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25664 }
25665
25666 return SDValue();
25667}
25668
25669/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25670/// each of which has no other use apart from the AND / OR.
25671static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25672 Opc = Op.getOpcode();
25673 if (Opc != ISD::OR && Opc != ISD::AND)
25674 return false;
25675 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25676 Op.getOperand(0).hasOneUse() &&
25677 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25678 Op.getOperand(1).hasOneUse());
25679}
25680
25681SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25682 SDValue Chain = Op.getOperand(0);
25683 SDValue Cond = Op.getOperand(1);
25684 SDValue Dest = Op.getOperand(2);
25685 SDLoc dl(Op);
25686
25687 // Bail out when we don't have native compare instructions.
25688 if (Cond.getOpcode() == ISD::SETCC &&
25689 Cond.getOperand(0).getValueType() != MVT::f128 &&
25690 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25691 SDValue LHS = Cond.getOperand(0);
25692 SDValue RHS = Cond.getOperand(1);
25693 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25694
25695 // Special case for
25696 // setcc([su]{add,sub,mul}o == 0)
25697 // setcc([su]{add,sub,mul}o != 1)
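// i.e. a branch on the overflow/carry result of an arithmetic-with-overflow node; this lowers directly to a flag-based branch below.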
25698 if (ISD::isOverflowIntrOpRes(LHS) &&
25699 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25700 (isNullConstant(RHS) || isOneConstant(RHS))) {
25701 SDValue Value, Overflow;
25702 X86::CondCode X86Cond;
25703 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25704
25705 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25706 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25707
25708 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25709 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25710 Overflow, Op->getFlags());
25711 }
25712
25713 if (LHS.getSimpleValueType().isInteger()) {
25714 SDValue CCVal;
25715 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25716 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25717 EFLAGS, Op->getFlags());
25718 }
25719
25720 if (CC == ISD::SETOEQ) {
25721 // For FCMP_OEQ, we can emit
25722 // two branches instead of an explicit AND instruction with a
25723 // separate test. However, we only do this if this block doesn't
25724 // have a fall-through edge, because this requires an explicit
25725 // jmp when the condition is false.
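// After the FP compare, OEQ holds exactly when ZF = 1 and PF = 0, so branching to the false block on COND_NE and on COND_P leaves the fall-through for the true case.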
25726 if (Op.getNode()->hasOneUse()) {
25727 SDNode *User = *Op.getNode()->user_begin();
25728 // Look for an unconditional branch following this conditional branch.
25729 // We need this because we need to reverse the successors in order
25730 // to implement FCMP_OEQ.
25731 if (User->getOpcode() == ISD::BR) {
25732 SDValue FalseBB = User->getOperand(1);
25733 SDNode *NewBR =
25734 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25735 assert(NewBR == User);
25736 (void)NewBR;
25737 Dest = FalseBB;
25738
25739 SDValue Cmp =
25740 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25741 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25742 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25743 CCVal, Cmp, Op->getFlags());
25744 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25745 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25746 Cmp, Op->getFlags());
25747 }
25748 }
25749 } else if (CC == ISD::SETUNE) {
25750 // For FCMP_UNE, we can emit
25751 // two branches instead of an explicit OR instruction with a
25752 // separate test.
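// UNE holds when ZF = 0 or PF = 1, so two branches on COND_NE and COND_P to the same destination implement the OR.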
25753 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25754 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25755 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25756 Cmp, Op->getFlags());
25757 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25758 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25759 Cmp, Op->getFlags());
25760 } else {
25761 X86::CondCode X86Cond =
25762 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25763 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25764 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25765 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25766 Cmp, Op->getFlags());
25767 }
25768 }
25769
25770 if (ISD::isOverflowIntrOpRes(Cond)) {
25771 SDValue Value, Overflow;
25772 X86::CondCode X86Cond;
25773 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25774
25775 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25776 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25777 Overflow, Op->getFlags());
25778 }
25779
25780 // Look past the truncate if the high bits are known zero.
25781 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25782 Cond = Cond.getOperand(0);
25783
25784 EVT CondVT = Cond.getValueType();
25785
25786 // Add an AND with 1 if we don't already have one.
25787 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25788 Cond =
25789 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25790
25791 SDValue LHS = Cond;
25792 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25793
25794 SDValue CCVal;
25795 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25796 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25797 Op->getFlags());
25798}
25799
25800// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25801// Calls to _alloca are needed to probe the stack when allocating more than 4k
25802// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25803// that the guard pages used by the OS virtual memory manager are allocated in
25804// correct sequence.
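// e.g. a dynamic allocation of 100000 bytes must go through the probe routine so every new 4K page is touched in order; a single stack-pointer adjustment could jump past the guard page.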
25805SDValue
25806X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25807 SelectionDAG &DAG) const {
25808 MachineFunction &MF = DAG.getMachineFunction();
25809 bool SplitStack = MF.shouldSplitStack();
25810 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25811 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25812 SplitStack || EmitStackProbeCall;
25813 SDLoc dl(Op);
25814
25815 // Get the inputs.
25816 SDNode *Node = Op.getNode();
25817 SDValue Chain = Op.getOperand(0);
25818 SDValue Size = Op.getOperand(1);
25819 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25820 EVT VT = Node->getValueType(0);
25821
25822 // Chain the dynamic stack allocation so that it doesn't modify the stack
25823 // pointer when other instructions are using the stack.
25824 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25825
25826 bool Is64Bit = Subtarget.is64Bit();
25827 MVT SPTy = Op.getValueType().getSimpleVT();
25828
25829 SDValue Result;
25830 if (!Lower) {
25831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25832 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25833 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25834 " not tell us which reg is the stack pointer!");
25835
25836 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25837 const Align StackAlign = TFI.getStackAlign();
25838 if (hasInlineStackProbe(MF)) {
25839 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25840 {Chain, Size});
25841 Chain = Result.getValue(1);
25842 } else {
25843 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25844 Chain = SP.getValue(1);
25845 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25846 }
25847 if (Alignment && *Alignment > StackAlign)
25848 Result = DAG.getNode(
25849 ISD::AND, dl, VT, Result,
25850 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25851 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25852 } else if (SplitStack) {
25853 if (Is64Bit) {
25854 // The 64 bit implementation of segmented stacks needs to clobber both r10 and
25855 // r11. This makes it impossible to use it along with nested parameters.
25856 const Function &F = MF.getFunction();
25857 for (const auto &A : F.args()) {
25858 if (A.hasNestAttr())
25859 report_fatal_error("Cannot use segmented stacks with functions that "
25860 "have nested arguments.");
25861 }
25862 }
25863
25864 Result =
25865 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25866 Chain = Result.getValue(1);
25867 } else {
25868 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25869 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25870 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25871
25872 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25873 Register SPReg = RegInfo->getStackRegister();
25874 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25875 Chain = SP.getValue(1);
25876
25877 if (Alignment) {
25878 SP = DAG.getNode(
25879 ISD::AND, dl, VT, SP.getValue(0),
25880 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25881 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25882 }
25883
25884 Result = SP;
25885 }
25886
25887 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25888
25889 SDValue Ops[2] = {Result, Chain};
25890 return DAG.getMergeValues(Ops, dl);
25891}
25892
25893SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25894 MachineFunction &MF = DAG.getMachineFunction();
25895 SDValue Ptr = Op.getOperand(1);
25896 EVT PtrVT = Ptr.getValueType();
25897
25898 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25899
25900 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25901 SDLoc DL(Op);
25902
25903 if (!Subtarget.is64Bit() ||
25904 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25905 // vastart just stores the address of the VarArgsFrameIndex slot into the
25906 // memory location argument.
25907 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25908 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25909 }
25910
25911 // __va_list_tag:
25912 // gp_offset (0 - 6 * 8)
25913 // fp_offset (48 - 48 + 8 * 16)
25914 // overflow_arg_area (point to parameters coming in memory).
25915 // reg_save_area
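// In C terms (SysV AMD64 ABI) the va_list element is roughly:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };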
25916 SmallVector<SDValue, 8> MemOps;
25917 SDValue FIN = Op.getOperand(1);
25918 // Store gp_offset
25919 SDValue Store = DAG.getStore(
25920 Op.getOperand(0), DL,
25921 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25922 MachinePointerInfo(SV));
25923 MemOps.push_back(Store);
25924
25925 // Store fp_offset
25926 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25927 Store = DAG.getStore(
25928 Op.getOperand(0), DL,
25929 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25930 MachinePointerInfo(SV, 4));
25931 MemOps.push_back(Store);
25932
25933 // Store ptr to overflow_arg_area
25934 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25935 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25936 Store =
25937 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25938 MemOps.push_back(Store);
25939
25940 // Store ptr to reg_save_area.
25941 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25942 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25943 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25944 Store = DAG.getStore(
25945 Op.getOperand(0), DL, RSFIN, FIN,
25946 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25947 MemOps.push_back(Store);
25948 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25949}
25950
25951SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25952 assert(Subtarget.is64Bit() &&
25953 "LowerVAARG only handles 64-bit va_arg!");
25954 assert(Op.getNumOperands() == 4);
25955
25956 MachineFunction &MF = DAG.getMachineFunction();
25957 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25958 // The Win64 ABI uses char* instead of a structure.
25959 return DAG.expandVAArg(Op.getNode());
25960
25961 SDValue Chain = Op.getOperand(0);
25962 SDValue SrcPtr = Op.getOperand(1);
25963 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25964 unsigned Align = Op.getConstantOperandVal(3);
25965 SDLoc dl(Op);
25966
25967 EVT ArgVT = Op.getNode()->getValueType(0);
25968 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25969 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25970 uint8_t ArgMode;
25971
25972 // Decide which area this value should be read from.
25973 // TODO: Implement the AMD64 ABI in its entirety. This simple
25974 // selection mechanism works only for the basic types.
25975 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25976 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25977 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25978 } else {
25979 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25980 "Unhandled argument type in LowerVAARG");
25981 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25982 }
25983
25984 if (ArgMode == 2) {
25985 // Make sure using fp_offset makes sense.
25986 assert(!Subtarget.useSoftFloat() &&
25987 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25988 Subtarget.hasSSE1());
25989 }
25990
25991 // Insert VAARG node into the DAG
25992 // VAARG returns two values: Variable Argument Address, Chain
25993 SDValue InstOps[] = {Chain, SrcPtr,
25994 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25995 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25996 DAG.getTargetConstant(Align, dl, MVT::i32)};
25997 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
25998 SDValue VAARG = DAG.getMemIntrinsicNode(
25999 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26000 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26001 /*Alignment=*/std::nullopt,
26002 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26003 Chain = VAARG.getValue(1);
26004
26005 // Load the next argument and return it
26006 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26007}
26008
26009static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26010 SelectionDAG &DAG) {
26011 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26012 // where a va_list is still an i8*.
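// Hence 4 + 4 + 8 + 8 = 24 bytes are copied under LP64, and 16 bytes under the x32 ABI where the two pointers are 4 bytes each.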
26013 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26014 if (Subtarget.isCallingConvWin64(
26015 DAG.getMachineFunction().getFunction().getCallingConv()))
26016 // Probably a Win64 va_copy.
26017 return DAG.expandVACopy(Op.getNode());
26018
26019 SDValue Chain = Op.getOperand(0);
26020 SDValue DstPtr = Op.getOperand(1);
26021 SDValue SrcPtr = Op.getOperand(2);
26022 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26023 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26024 SDLoc DL(Op);
26025
26026 return DAG.getMemcpy(
26027 Chain, DL, DstPtr, SrcPtr,
26028 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26029 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26030 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26031 MachinePointerInfo(SrcSV));
26032}
26033
26034// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
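// e.g. ISD::SHL maps to X86ISD::VSHLI for an immediate amount and to X86ISD::VSHL for a variable (splatted) amount; SRL and SRA map likewise.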
26035static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26036 switch (Opc) {
26037 case ISD::SHL:
26038 case X86ISD::VSHL:
26039 case X86ISD::VSHLI:
26040 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26041 case ISD::SRL:
26042 case X86ISD::VSRL:
26043 case X86ISD::VSRLI:
26044 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26045 case ISD::SRA:
26046 case X86ISD::VSRA:
26047 case X86ISD::VSRAI:
26048 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26049 }
26050 llvm_unreachable("Unknown target vector shift node");
26051}
26052
26053/// Handle vector element shifts where the shift amount is a constant.
26054/// Takes immediate version of shift as input.
26055static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26056 SDValue SrcOp, uint64_t ShiftAmt,
26057 SelectionDAG &DAG) {
26058 MVT ElementType = VT.getVectorElementType();
26059
26060 // Bitcast the source vector to the output type, this is mainly necessary for
26061 // vXi8/vXi64 shifts.
26062 if (VT != SrcOp.getSimpleValueType())
26063 SrcOp = DAG.getBitcast(VT, SrcOp);
26064
26065 // Fold this packed shift into its first operand if ShiftAmt is 0.
26066 if (ShiftAmt == 0)
26067 return SrcOp;
26068
26069 // Check for ShiftAmt >= element width
26070 if (ShiftAmt >= ElementType.getSizeInBits()) {
26071 if (Opc == X86ISD::VSRAI)
26072 ShiftAmt = ElementType.getSizeInBits() - 1;
26073 else
26074 return DAG.getConstant(0, dl, VT);
26075 }
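// Logical shifts by >= the element width produce zero; arithmetic shifts saturate at width - 1, which splats the sign bit (matching PSRA semantics).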
26076
26078 && "Unknown target vector shift-by-constant node");
26079
26080 // Fold this packed vector shift into a build vector if SrcOp is a
26081 // vector of Constants or UNDEFs.
26082 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26083 unsigned ShiftOpc;
26084 switch (Opc) {
26085 default: llvm_unreachable("Unknown opcode!");
26086 case X86ISD::VSHLI:
26087 ShiftOpc = ISD::SHL;
26088 break;
26089 case X86ISD::VSRLI:
26090 ShiftOpc = ISD::SRL;
26091 break;
26092 case X86ISD::VSRAI:
26093 ShiftOpc = ISD::SRA;
26094 break;
26095 }
26096
26097 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26098 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26099 return C;
26100 }
26101
26102 return DAG.getNode(Opc, dl, VT, SrcOp,
26103 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26104}
26105
26106/// Handle vector element shifts by a splat shift amount
26107static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26108 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26109 const X86Subtarget &Subtarget,
26110 SelectionDAG &DAG) {
26111 MVT AmtVT = ShAmt.getSimpleValueType();
26112 assert(AmtVT.isVector() && "Vector shift type mismatch");
26113 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26114 "Illegal vector splat index");
26115
26116 // Move the splat element to the bottom element.
26117 if (ShAmtIdx != 0) {
26118 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26119 Mask[0] = ShAmtIdx;
26120 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26121 }
26122
26123 // Peek through any zext node if we can get back to a 128-bit source.
26124 if (AmtVT.getScalarSizeInBits() == 64 &&
26125 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26126 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26127 ShAmt.getOperand(0).getValueType().isSimple() &&
26128 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26129 ShAmt = ShAmt.getOperand(0);
26130 AmtVT = ShAmt.getSimpleValueType();
26131 }
26132
26133 // See if we can mask off the upper elements using the existing source node.
26134 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26135 // do this for vXi64 types.
26136 bool IsMasked = false;
26137 if (AmtVT.getScalarSizeInBits() < 64) {
26138 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26139 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26140 // If the shift amount has come from a scalar, then zero-extend the scalar
26141 // before moving to the vector.
26142 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26143 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26144 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26145 AmtVT = MVT::v4i32;
26146 IsMasked = true;
26147 } else if (ShAmt.getOpcode() == ISD::AND) {
26148 // See if the shift amount is already masked (e.g. for rotation modulo),
26149 // then we can zero-extend it by setting all the other mask elements to
26150 // zero.
26151 SmallVector<SDValue> MaskElts(
26152 AmtVT.getVectorNumElements(),
26153 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26154 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26155 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26156 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26157 {ShAmt.getOperand(1), Mask}))) {
26158 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26159 IsMasked = true;
26160 }
26161 }
26162 }
26163
26164 // Extract if the shift amount vector is larger than 128-bits.
26165 if (AmtVT.getSizeInBits() > 128) {
26166 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26167 AmtVT = ShAmt.getSimpleValueType();
26168 }
26169
26170 // Zero-extend bottom element to v2i64 vector type, either by extension or
26171 // shuffle masking.
26172 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26173 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26174 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26175 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26176 } else if (Subtarget.hasSSE41()) {
26177 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26178 MVT::v2i64, ShAmt);
26179 } else {
26180 SDValue ByteShift = DAG.getTargetConstant(
26181 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26182 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26183 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26184 ByteShift);
26185 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26186 ByteShift);
26187 }
26188 }
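// Without SSE4.1, the byte-shift pair above moves the low element to the top of the 128-bit value and back, clearing every byte above it; this zero-extends the shift amount in-register.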
26189
26190 // Change opcode to non-immediate version.
26191 Opc = getTargetVShiftUniformOpcode(Opc, true);
26192
26193 // The return type has to be a 128-bit type with the same element
26194 // type as the input type.
26195 MVT EltVT = VT.getVectorElementType();
26196 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26197
26198 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26199 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26200}
26201
26202/// Return Mask with the necessary casting or extending
26203/// for \p Mask according to \p MaskVT when lowering masking intrinsics
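// e.g. an i8 mask driving a v4i1 operation is bitcast to v8i1 and its low four lanes extracted; an i64 mask on a 32-bit target is split into two i32 halves and re-concatenated as v64i1.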
26204static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26205 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26206 const SDLoc &dl) {
26207
26208 if (isAllOnesConstant(Mask))
26209 return DAG.getConstant(1, dl, MaskVT);
26210 if (X86::isZeroNode(Mask))
26211 return DAG.getConstant(0, dl, MaskVT);
26212
26213 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26214
26215 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26216 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26217 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26218 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
26219 SDValue Lo, Hi;
26220 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26221 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26222 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26223 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26224 } else {
26225 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26226 Mask.getSimpleValueType().getSizeInBits());
26227 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
26228 // are extracted by EXTRACT_SUBVECTOR.
26229 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26230 DAG.getBitcast(BitcastVT, Mask),
26231 DAG.getVectorIdxConstant(0, dl));
26232 }
26233}
26234
26235/// Return (and \p Op, \p Mask) for compare instructions or
26236/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26237/// necessary casting or extending for \p Mask when lowering masking intrinsics
26238 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26239 SDValue PreservedSrc,
26240 const X86Subtarget &Subtarget,
26241 SelectionDAG &DAG) {
26242 MVT VT = Op.getSimpleValueType();
26243 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26244 unsigned OpcodeSelect = ISD::VSELECT;
26245 SDLoc dl(Op);
26246
26247 if (isAllOnesConstant(Mask))
26248 return Op;
26249
26250 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26251
26252 if (PreservedSrc.isUndef())
26253 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26254 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26255}
26256
26257/// Creates an SDNode for a predicated scalar operation.
26258/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26259/// The mask is coming as MVT::i8 and it should be transformed
26260/// to MVT::v1i1 while lowering masking intrinsics.
26261/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26262/// "X86select" instead of "vselect". We just can't create the "vselect" node
26263/// for a scalar instruction.
26264 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26265 SDValue PreservedSrc,
26266 const X86Subtarget &Subtarget,
26267 SelectionDAG &DAG) {
26268 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26269 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26270 return Op;
26271
26272 MVT VT = Op.getSimpleValueType();
26273 SDLoc dl(Op);
26274
26275 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26276 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26277 DAG.getBitcast(MVT::v8i1, Mask),
26278 DAG.getVectorIdxConstant(0, dl));
26279 if (Op.getOpcode() == X86ISD::FSETCCM ||
26280 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26281 Op.getOpcode() == X86ISD::VFPCLASSS)
26282 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26283
26284 if (PreservedSrc.isUndef())
26285 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26286
26287 if (MaskConst) {
26288 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26289 // Discard op and blend passthrough with scalar op src/dst.
26290 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26291 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26292 ShuffleMask[0] = VT.getVectorNumElements();
26293 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26294 ShuffleMask);
26295 }
26296
26297 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26298}
26299
26300 static int getSEHRegistrationNodeSize(const Function *Fn) {
26301 if (!Fn->hasPersonalityFn())
26302 report_fatal_error(
26303 "querying registration node size for function without personality");
26304 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26305 // WinEHStatePass for the full struct definition.
26306 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26307 case EHPersonality::MSVC_X86SEH: return 24;
26308 case EHPersonality::MSVC_CXX: return 16;
26309 default: break;
26310 }
26312 "can only recover FP for 32-bit MSVC EH personality functions");
26313}
26314
26315/// When the MSVC runtime transfers control to us, either to an outlined
26316/// function or when returning to a parent frame after catching an exception, we
26317/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26318/// Here's the math:
26319/// RegNodeBase = EntryEBP - RegNodeSize
26320/// ParentFP = RegNodeBase - ParentFrameOffset
26321/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26322/// subtracting the offset (negative on x86) takes us back to the parent FP.
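/// Worked example with a hypothetical offset: for the MSVC C++ personality (RegNodeSize = 16) and ParentFrameOffset = -40, ParentFP = (EntryEBP - 16) - (-40) = EntryEBP + 24.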
26323 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26324 SDValue EntryEBP) {
26325 MachineFunction &MF = DAG.getMachineFunction();
26326 SDLoc dl;
26327
26328 // It's possible that the parent function no longer has a personality function
26329 // if the exceptional code was optimized away, in which case we just return
26330 // the incoming EBP.
26331 if (!Fn->hasPersonalityFn())
26332 return EntryEBP;
26333
26334 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26335 // registration, or the .set_setframe offset.
26336 MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
26337 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26338 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26339 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26340 SDValue ParentFrameOffset =
26341 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26342
26343 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26344 // prologue to RBP in the parent function.
26345 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26346 if (Subtarget.is64Bit())
26347 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26348
26349 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26350 // RegNodeBase = EntryEBP - RegNodeSize
26351 // ParentFP = RegNodeBase - ParentFrameOffset
26352 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26353 DAG.getConstant(RegNodeSize, dl, PtrVT));
26354 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26355}
26356
26357SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26358 SelectionDAG &DAG) const {
26359 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26360 auto isRoundModeCurDirection = [](SDValue Rnd) {
26361 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26362 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26363
26364 return false;
26365 };
26366 auto isRoundModeSAE = [](SDValue Rnd) {
26367 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26368 unsigned RC = C->getZExtValue();
26369 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26370 // Clear the NO_EXC bit and check remaining bits.
26371 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26372 // As a convenience we allow no other bits or explicitly
26373 // current direction.
26374 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26375 }
26376 }
26377
26378 return false;
26379 };
26380 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26381 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26382 RC = C->getZExtValue();
26383 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26384 // Clear the NO_EXC bit and check remaining bits.
26385 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26386 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26387 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26388 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26389 RC == X86::STATIC_ROUNDING::TO_ZERO;
26390 }
26391 }
26392
26393 return false;
26394 };
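// The rounding immediates follow X86::STATIC_ROUNDING: 0-3 select nearest / toward -inf / toward +inf / toward zero, 4 means 'current direction' (use MXCSR), and the NO_EXC bit (8) requests suppress-all-exceptions.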
26395
26396 SDLoc dl(Op);
26397 unsigned IntNo = Op.getConstantOperandVal(0);
26398 MVT VT = Op.getSimpleValueType();
26399 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26400
26401 // Propagate flags from original node to transformed node(s).
26402 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26403
26404 if (IntrData) {
26405 switch(IntrData->Type) {
26406 case INTR_TYPE_1OP: {
26407 // We specify 2 possible opcodes for intrinsics with rounding modes.
26408 // First, we check if the intrinsic may have non-default rounding mode,
26409 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26410 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26411 if (IntrWithRoundingModeOpcode != 0) {
26412 SDValue Rnd = Op.getOperand(2);
26413 unsigned RC = 0;
26414 if (isRoundModeSAEToX(Rnd, RC))
26415 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26416 Op.getOperand(1),
26417 DAG.getTargetConstant(RC, dl, MVT::i32));
26418 if (!isRoundModeCurDirection(Rnd))
26419 return SDValue();
26420 }
26421 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26422 Op.getOperand(1));
26423 }
26424 case INTR_TYPE_1OP_SAE: {
26425 SDValue Sae = Op.getOperand(2);
26426
26427 unsigned Opc;
26428 if (isRoundModeCurDirection(Sae))
26429 Opc = IntrData->Opc0;
26430 else if (isRoundModeSAE(Sae))
26431 Opc = IntrData->Opc1;
26432 else
26433 return SDValue();
26434
26435 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26436 }
26437 case INTR_TYPE_2OP: {
26438 SDValue Src2 = Op.getOperand(2);
26439
26440 // We specify 2 possible opcodes for intrinsics with rounding modes.
26441 // First, we check if the intrinsic may have non-default rounding mode,
26442 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26443 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26444 if (IntrWithRoundingModeOpcode != 0) {
26445 SDValue Rnd = Op.getOperand(3);
26446 unsigned RC = 0;
26447 if (isRoundModeSAEToX(Rnd, RC))
26448 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26449 Op.getOperand(1), Src2,
26450 DAG.getTargetConstant(RC, dl, MVT::i32));
26451 if (!isRoundModeCurDirection(Rnd))
26452 return SDValue();
26453 }
26454
26455 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26456 Op.getOperand(1), Src2);
26457 }
26458 case INTR_TYPE_2OP_SAE: {
26459 SDValue Sae = Op.getOperand(3);
26460
26461 unsigned Opc;
26462 if (isRoundModeCurDirection(Sae))
26463 Opc = IntrData->Opc0;
26464 else if (isRoundModeSAE(Sae))
26465 Opc = IntrData->Opc1;
26466 else
26467 return SDValue();
26468
26469 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26470 Op.getOperand(2));
26471 }
26472 case INTR_TYPE_3OP:
26473 case INTR_TYPE_3OP_IMM8: {
26474 SDValue Src1 = Op.getOperand(1);
26475 SDValue Src2 = Op.getOperand(2);
26476 SDValue Src3 = Op.getOperand(3);
26477
26478 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26479 Src3.getValueType() != MVT::i8) {
26480 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26481 }
26482
26483 // We specify 2 possible opcodes for intrinsics with rounding modes.
26484 // First, we check if the intrinsic may have non-default rounding mode,
26485 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26486 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26487 if (IntrWithRoundingModeOpcode != 0) {
26488 SDValue Rnd = Op.getOperand(4);
26489 unsigned RC = 0;
26490 if (isRoundModeSAEToX(Rnd, RC))
26491 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26492 Src1, Src2, Src3,
26493 DAG.getTargetConstant(RC, dl, MVT::i32));
26494 if (!isRoundModeCurDirection(Rnd))
26495 return SDValue();
26496 }
26497
26498 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26499 {Src1, Src2, Src3});
26500 }
26501 case INTR_TYPE_4OP_IMM8: {
26502 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26503 SDValue Src4 = Op.getOperand(4);
26504 if (Src4.getValueType() != MVT::i8) {
26505 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26506 }
26507
26508 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26509 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26510 Src4);
26511 }
26512 case INTR_TYPE_1OP_MASK: {
26513 SDValue Src = Op.getOperand(1);
26514 SDValue PassThru = Op.getOperand(2);
26515 SDValue Mask = Op.getOperand(3);
26516 // We add rounding mode to the Node when
26517 // - RC Opcode is specified and
26518 // - RC is not "current direction".
26519 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26520 if (IntrWithRoundingModeOpcode != 0) {
26521 SDValue Rnd = Op.getOperand(4);
26522 unsigned RC = 0;
26523 if (isRoundModeSAEToX(Rnd, RC))
26524 return getVectorMaskingNode(
26525 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26526 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26527 Mask, PassThru, Subtarget, DAG);
26528 if (!isRoundModeCurDirection(Rnd))
26529 return SDValue();
26530 }
26531 return getVectorMaskingNode(
26532 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26533 Subtarget, DAG);
26534 }
26535 case INTR_TYPE_1OP_MASK_SAE: {
26536 SDValue Src = Op.getOperand(1);
26537 SDValue PassThru = Op.getOperand(2);
26538 SDValue Mask = Op.getOperand(3);
26539 SDValue Rnd = Op.getOperand(4);
26540
26541 unsigned Opc;
26542 if (isRoundModeCurDirection(Rnd))
26543 Opc = IntrData->Opc0;
26544 else if (isRoundModeSAE(Rnd))
26545 Opc = IntrData->Opc1;
26546 else
26547 return SDValue();
26548
26549 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26550 Subtarget, DAG);
26551 }
26552 case INTR_TYPE_SCALAR_MASK: {
26553 SDValue Src1 = Op.getOperand(1);
26554 SDValue Src2 = Op.getOperand(2);
26555 SDValue passThru = Op.getOperand(3);
26556 SDValue Mask = Op.getOperand(4);
26557 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26558 // There are 2 kinds of intrinsics in this group:
26559 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26560 // (2) With rounding mode and sae - 7 operands.
26561 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26562 if (Op.getNumOperands() == (5U + HasRounding)) {
26563 if (HasRounding) {
26564 SDValue Rnd = Op.getOperand(5);
26565 unsigned RC = 0;
26566 if (isRoundModeSAEToX(Rnd, RC))
26567 return getScalarMaskingNode(
26568 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26569 DAG.getTargetConstant(RC, dl, MVT::i32)),
26570 Mask, passThru, Subtarget, DAG);
26571 if (!isRoundModeCurDirection(Rnd))
26572 return SDValue();
26573 }
26574 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26575 Src2),
26576 Mask, passThru, Subtarget, DAG);
26577 }
26578
26579 assert(Op.getNumOperands() == (6U + HasRounding) &&
26580 "Unexpected intrinsic form");
26581 SDValue RoundingMode = Op.getOperand(5);
26582 unsigned Opc = IntrData->Opc0;
26583 if (HasRounding) {
26584 SDValue Sae = Op.getOperand(6);
26585 if (isRoundModeSAE(Sae))
26586 Opc = IntrWithRoundingModeOpcode;
26587 else if (!isRoundModeCurDirection(Sae))
26588 return SDValue();
26589 }
26590 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26591 Src2, RoundingMode),
26592 Mask, passThru, Subtarget, DAG);
26593 }
26594 case INTR_TYPE_SCALAR_MASK_RND: {
26595 SDValue Src1 = Op.getOperand(1);
26596 SDValue Src2 = Op.getOperand(2);
26597 SDValue passThru = Op.getOperand(3);
26598 SDValue Mask = Op.getOperand(4);
26599 SDValue Rnd = Op.getOperand(5);
26600
26601 SDValue NewOp;
26602 unsigned RC = 0;
26603 if (isRoundModeCurDirection(Rnd))
26604 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26605 else if (isRoundModeSAEToX(Rnd, RC))
26606 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26607 DAG.getTargetConstant(RC, dl, MVT::i32));
26608 else
26609 return SDValue();
26610
26611 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26612 }
26613 case INTR_TYPE_SCALAR_MASK_SAE: {
26614 SDValue Src1 = Op.getOperand(1);
26615 SDValue Src2 = Op.getOperand(2);
26616 SDValue passThru = Op.getOperand(3);
26617 SDValue Mask = Op.getOperand(4);
26618 SDValue Sae = Op.getOperand(5);
26619 unsigned Opc;
26620 if (isRoundModeCurDirection(Sae))
26621 Opc = IntrData->Opc0;
26622 else if (isRoundModeSAE(Sae))
26623 Opc = IntrData->Opc1;
26624 else
26625 return SDValue();
26626
26627 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26628 Mask, passThru, Subtarget, DAG);
26629 }
26630 case INTR_TYPE_2OP_MASK: {
26631 SDValue Src1 = Op.getOperand(1);
26632 SDValue Src2 = Op.getOperand(2);
26633 SDValue PassThru = Op.getOperand(3);
26634 SDValue Mask = Op.getOperand(4);
26635 SDValue NewOp;
26636 if (IntrData->Opc1 != 0) {
26637 SDValue Rnd = Op.getOperand(5);
26638 unsigned RC = 0;
26639 if (isRoundModeSAEToX(Rnd, RC))
26640 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26641 DAG.getTargetConstant(RC, dl, MVT::i32));
26642 else if (!isRoundModeCurDirection(Rnd))
26643 return SDValue();
26644 }
26645 if (!NewOp)
26646 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26647 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26648 }
26649 case INTR_TYPE_2OP_MASK_SAE: {
26650 SDValue Src1 = Op.getOperand(1);
26651 SDValue Src2 = Op.getOperand(2);
26652 SDValue PassThru = Op.getOperand(3);
26653 SDValue Mask = Op.getOperand(4);
26654
26655 unsigned Opc = IntrData->Opc0;
26656 if (IntrData->Opc1 != 0) {
26657 SDValue Sae = Op.getOperand(5);
26658 if (isRoundModeSAE(Sae))
26659 Opc = IntrData->Opc1;
26660 else if (!isRoundModeCurDirection(Sae))
26661 return SDValue();
26662 }
26663
26664 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26665 Mask, PassThru, Subtarget, DAG);
26666 }
26667 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26668 SDValue Src1 = Op.getOperand(1);
26669 SDValue Src2 = Op.getOperand(2);
26670 SDValue Src3 = Op.getOperand(3);
26671 SDValue PassThru = Op.getOperand(4);
26672 SDValue Mask = Op.getOperand(5);
26673 SDValue Sae = Op.getOperand(6);
26674 unsigned Opc;
26675 if (isRoundModeCurDirection(Sae))
26676 Opc = IntrData->Opc0;
26677 else if (isRoundModeSAE(Sae))
26678 Opc = IntrData->Opc1;
26679 else
26680 return SDValue();
26681
26682 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26683 Mask, PassThru, Subtarget, DAG);
26684 }
26685 case INTR_TYPE_3OP_MASK_SAE: {
26686 SDValue Src1 = Op.getOperand(1);
26687 SDValue Src2 = Op.getOperand(2);
26688 SDValue Src3 = Op.getOperand(3);
26689 SDValue PassThru = Op.getOperand(4);
26690 SDValue Mask = Op.getOperand(5);
26691
26692 unsigned Opc = IntrData->Opc0;
26693 if (IntrData->Opc1 != 0) {
26694 SDValue Sae = Op.getOperand(6);
26695 if (isRoundModeSAE(Sae))
26696 Opc = IntrData->Opc1;
26697 else if (!isRoundModeCurDirection(Sae))
26698 return SDValue();
26699 }
26700 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26701 Mask, PassThru, Subtarget, DAG);
26702 }
26703 case BLENDV: {
26704 SDValue Src1 = Op.getOperand(1);
26705 SDValue Src2 = Op.getOperand(2);
26706 SDValue Src3 = Op.getOperand(3);
26707
26708 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26709 Src3 = DAG.getBitcast(MaskVT, Src3);
26710
26711 // Reverse the operands to match VSELECT order.
26712 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26713 }
26714 case VPERM_2OP : {
26715 SDValue Src1 = Op.getOperand(1);
26716 SDValue Src2 = Op.getOperand(2);
26717
26718 // Swap Src1 and Src2 in the node creation
26719 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26720 }
26721 case CFMA_OP_MASKZ:
26722 case CFMA_OP_MASK: {
26723 SDValue Src1 = Op.getOperand(1);
26724 SDValue Src2 = Op.getOperand(2);
26725 SDValue Src3 = Op.getOperand(3);
26726 SDValue Mask = Op.getOperand(4);
26727 MVT VT = Op.getSimpleValueType();
26728
26729 SDValue PassThru = Src3;
26730 if (IntrData->Type == CFMA_OP_MASKZ)
26731 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26732
26733 // We add rounding mode to the Node when
26734 // - RC Opcode is specified and
26735 // - RC is not "current direction".
26736 SDValue NewOp;
26737 if (IntrData->Opc1 != 0) {
26738 SDValue Rnd = Op.getOperand(5);
26739 unsigned RC = 0;
26740 if (isRoundModeSAEToX(Rnd, RC))
26741 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26742 DAG.getTargetConstant(RC, dl, MVT::i32));
26743 else if (!isRoundModeCurDirection(Rnd))
26744 return SDValue();
26745 }
26746 if (!NewOp)
26747 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26748 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26749 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26750 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26751 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26752 }
26753 case IFMA_OP:
26754 // NOTE: We need to swizzle the operands to pass the multiply operands
26755 // first.
26756 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26757 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26758 case FPCLASSS: {
26759 SDValue Src1 = Op.getOperand(1);
26760 SDValue Imm = Op.getOperand(2);
26761 SDValue Mask = Op.getOperand(3);
26762 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26763 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26764 Subtarget, DAG);
26765 // Need to fill with zeros to ensure the bitcast will produce zeroes
26766 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26767 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26768 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26769 DAG.getVectorIdxConstant(0, dl));
26770 return DAG.getBitcast(MVT::i8, Ins);
26771 }
26772
26773 case CMP_MASK_CC: {
26774 MVT MaskVT = Op.getSimpleValueType();
26775 SDValue CC = Op.getOperand(3);
26776 SDValue Mask = Op.getOperand(4);
26777 // We specify 2 possible opcodes for intrinsics with rounding modes.
26778 // First, we check if the intrinsic may have non-default rounding mode,
26779 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26780 if (IntrData->Opc1 != 0) {
26781 SDValue Sae = Op.getOperand(5);
26782 if (isRoundModeSAE(Sae))
26783 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26784 Op.getOperand(2), CC, Mask, Sae);
26785 if (!isRoundModeCurDirection(Sae))
26786 return SDValue();
26787 }
26788 //default rounding mode
26789 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26790 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26791 }
26792 case CMP_MASK_SCALAR_CC: {
26793 SDValue Src1 = Op.getOperand(1);
26794 SDValue Src2 = Op.getOperand(2);
26795 SDValue CC = Op.getOperand(3);
26796 SDValue Mask = Op.getOperand(4);
26797
26798 SDValue Cmp;
26799 if (IntrData->Opc1 != 0) {
26800 SDValue Sae = Op.getOperand(5);
26801 if (isRoundModeSAE(Sae))
26802 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26803 else if (!isRoundModeCurDirection(Sae))
26804 return SDValue();
26805 }
26806 //default rounding mode
26807 if (!Cmp.getNode())
26808 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26809
26810 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26811 Subtarget, DAG);
26812 // Need to fill with zeros to ensure the bitcast will produce zeroes
26813 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26814 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26815 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26816 DAG.getVectorIdxConstant(0, dl));
26817 return DAG.getBitcast(MVT::i8, Ins);
26818 }
26819 case COMI: { // Comparison intrinsics
26820 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26821 SDValue LHS = Op.getOperand(1);
26822 SDValue RHS = Op.getOperand(2);
26823 // Some conditions require the operands to be swapped.
26824 if (CC == ISD::SETLT || CC == ISD::SETLE)
26825 std::swap(LHS, RHS);
26826
26827 // For AVX10.2, Support EQ and NE.
26828 bool HasAVX10_2_COMX =
26829 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26830
26831 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26832 // For BF type we need to fall back.
26833 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26834
26835 auto ComiOpCode = IntrData->Opc0;
26836 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26837
26838 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26839 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26840
26841 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26842
26843 SDValue SetCC;
26844 switch (CC) {
26845 case ISD::SETEQ: {
26846 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26847 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26848 break;
26849 // (ZF = 1 and PF = 0)
26850 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26851 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26852 break;
26853 }
26854 case ISD::SETNE: {
26855 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26856 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26857 break;
26858 // (ZF = 0 or PF = 1)
26859 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26860 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26861 break;
26862 }
26863 case ISD::SETGT: // (CF = 0 and ZF = 0)
26864 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26865 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26866 break;
26867 }
26868 case ISD::SETGE: // CF = 0
26869 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26870 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26871 break;
26872 default:
26873 llvm_unreachable("Unexpected illegal condition!");
26874 }
26875 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26876 }
26877 case COMI_RM: { // Comparison intrinsics with Sae
26878 SDValue LHS = Op.getOperand(1);
26879 SDValue RHS = Op.getOperand(2);
26880 unsigned CondVal = Op.getConstantOperandVal(3);
26881 SDValue Sae = Op.getOperand(4);
26882
26883 SDValue FCmp;
26884 if (isRoundModeCurDirection(Sae))
26885 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26886 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26887 else if (isRoundModeSAE(Sae))
26888 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26889 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26890 else
26891 return SDValue();
26892 // Need to fill with zeros to ensure the bitcast will produce zeroes
26893 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26894 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26895 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26896 DAG.getVectorIdxConstant(0, dl));
26897 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26898 DAG.getBitcast(MVT::i16, Ins));
26899 }
26900 case VSHIFT: {
26901 SDValue SrcOp = Op.getOperand(1);
26902 SDValue ShAmt = Op.getOperand(2);
26903 assert(ShAmt.getValueType() == MVT::i32 &&
26904 "Unexpected VSHIFT amount type");
26905
26906 // Catch shift-by-constant.
26907 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26908 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26909 Op.getSimpleValueType(), SrcOp,
26910 CShAmt->getZExtValue(), DAG);
26911
26912 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26913 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26914 SrcOp, ShAmt, 0, Subtarget, DAG);
26915 }
26916 case COMPRESS_EXPAND_IN_REG: {
26917 SDValue Mask = Op.getOperand(3);
26918 SDValue DataToCompress = Op.getOperand(1);
26919 SDValue PassThru = Op.getOperand(2);
26920 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26921 return Op.getOperand(1);
26922
26923 // Avoid false dependency.
26924 if (PassThru.isUndef())
26925 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26926
26927 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26928 Mask);
26929 }
26930 case FIXUPIMM:
26931 case FIXUPIMM_MASKZ: {
26932 SDValue Src1 = Op.getOperand(1);
26933 SDValue Src2 = Op.getOperand(2);
26934 SDValue Src3 = Op.getOperand(3);
26935 SDValue Imm = Op.getOperand(4);
26936 SDValue Mask = Op.getOperand(5);
26937 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26938 ? Src1
26939 : getZeroVector(VT, Subtarget, DAG, dl);
26940
26941 unsigned Opc = IntrData->Opc0;
26942 if (IntrData->Opc1 != 0) {
26943 SDValue Sae = Op.getOperand(6);
26944 if (isRoundModeSAE(Sae))
26945 Opc = IntrData->Opc1;
26946 else if (!isRoundModeCurDirection(Sae))
26947 return SDValue();
26948 }
26949
26950 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26951
26952 if (VT.isVector())
26953 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26954
26955 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26956 }
26957 case ROUNDP: {
26958 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26959 // Clear the upper bits of the rounding immediate so that the legacy
26960 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
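// In the VRNDSCALE immediate, bits 7:4 select the scale M (round to multiples of 2^-M); keeping only bits 3:0 preserves the legacy rounding-control behaviour of ROUNDPS/ROUNDPD.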
26961 uint64_t Round = Op.getConstantOperandVal(2);
26962 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26963 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26964 Op.getOperand(1), RoundingMode);
26965 }
26966 case ROUNDS: {
26967 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26968 // Clear the upper bits of the rounding immediate so that the legacy
26969 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26970 uint64_t Round = Op.getConstantOperandVal(3);
26971 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26972 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26973 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26974 }
26975 case BEXTRI: {
26976 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26977
26978 uint64_t Imm = Op.getConstantOperandVal(2);
26979 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26980 Op.getValueType());
26981 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26982 Op.getOperand(1), Control);
26983 }
26984 // ADC/SBB
26985 case ADX: {
26986 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26987 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26988
26989 SDValue Res;
26990 // If the carry in is zero, then we should just use ADD/SUB instead of
26991 // ADC/SBB.
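// Otherwise CF is regenerated by adding 0xFF to the i8 carry-in, which overflows exactly when the carry-in is non-zero.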
26992 if (isNullConstant(Op.getOperand(1))) {
26993 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26994 Op.getOperand(3));
26995 } else {
26996 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26997 DAG.getAllOnesConstant(dl, MVT::i8));
26998 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26999 Op.getOperand(3), GenCF.getValue(1));
27000 }
27001 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27002 SDValue Results[] = { SetCC, Res };
27003 return DAG.getMergeValues(Results, dl);
27004 }
27005 case CVTPD2PS_MASK:
27006 case CVTPD2DQ_MASK:
27007 case CVTQQ2PS_MASK:
27008 case TRUNCATE_TO_REG: {
27009 SDValue Src = Op.getOperand(1);
27010 SDValue PassThru = Op.getOperand(2);
27011 SDValue Mask = Op.getOperand(3);
27012
27013 if (isAllOnesConstant(Mask))
27014 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27015
27016 MVT SrcVT = Src.getSimpleValueType();
27017 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27018 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27019 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27020 {Src, PassThru, Mask});
27021 }
27022 case TRUNCATE2_TO_REG: {
27023 SDValue Src = Op.getOperand(1);
27024 SDValue Src2 = Op.getOperand(2);
27025 SDValue PassThru = Op.getOperand(3);
27026 SDValue Mask = Op.getOperand(4);
27027
27028 if (isAllOnesConstant(Mask))
27029 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27030
27031 MVT Src2VT = Src2.getSimpleValueType();
27032 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27033 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27034 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27035 {Src, Src2, PassThru, Mask});
27036 }
27037 case CVTPS2PH_MASK: {
27038 SDValue Src = Op.getOperand(1);
27039 SDValue Rnd = Op.getOperand(2);
27040 SDValue PassThru = Op.getOperand(3);
27041 SDValue Mask = Op.getOperand(4);
27042
27043 unsigned RC = 0;
27044 unsigned Opc = IntrData->Opc0;
27045 bool SAE = Src.getValueType().is512BitVector() &&
27046 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27047 if (SAE) {
27048 Opc = X86ISD::CVTPS2PH_SAE;
27049 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27050 }
27051
27052 if (isAllOnesConstant(Mask))
27053 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27054
27055 if (SAE)
27056 Opc = X86ISD::MCVTPS2PH_SAE;
27057 else
27058 Opc = IntrData->Opc1;
27059 MVT SrcVT = Src.getSimpleValueType();
27060 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27061 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27062 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27063 }
27064 case CVTNEPS2BF16_MASK: {
27065 SDValue Src = Op.getOperand(1);
27066 SDValue PassThru = Op.getOperand(2);
27067 SDValue Mask = Op.getOperand(3);
27068
27069 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27070 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27071
27072 // Break false dependency.
27073 if (PassThru.isUndef())
27074 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27075
27076 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27077 Mask);
27078 }
27079 default:
27080 break;
27081 }
27082 }
27083
27084 switch (IntNo) {
27085 default: return SDValue(); // Don't custom lower most intrinsics.
27086
27087 // ptest and testp intrinsics. The intrinsic these come from are designed to
27088 // return an integer value, not just an instruction so lower it to the ptest
27089 // or testp pattern and a setcc for the result.
27090 case Intrinsic::x86_avx512_ktestc_b:
27091 case Intrinsic::x86_avx512_ktestc_w:
27092 case Intrinsic::x86_avx512_ktestc_d:
27093 case Intrinsic::x86_avx512_ktestc_q:
27094 case Intrinsic::x86_avx512_ktestz_b:
27095 case Intrinsic::x86_avx512_ktestz_w:
27096 case Intrinsic::x86_avx512_ktestz_d:
27097 case Intrinsic::x86_avx512_ktestz_q:
27098 case Intrinsic::x86_sse41_ptestz:
27099 case Intrinsic::x86_sse41_ptestc:
27100 case Intrinsic::x86_sse41_ptestnzc:
27101 case Intrinsic::x86_avx_ptestz_256:
27102 case Intrinsic::x86_avx_ptestc_256:
27103 case Intrinsic::x86_avx_ptestnzc_256:
27104 case Intrinsic::x86_avx_vtestz_ps:
27105 case Intrinsic::x86_avx_vtestc_ps:
27106 case Intrinsic::x86_avx_vtestnzc_ps:
27107 case Intrinsic::x86_avx_vtestz_pd:
27108 case Intrinsic::x86_avx_vtestc_pd:
27109 case Intrinsic::x86_avx_vtestnzc_pd:
27110 case Intrinsic::x86_avx_vtestz_ps_256:
27111 case Intrinsic::x86_avx_vtestc_ps_256:
27112 case Intrinsic::x86_avx_vtestnzc_ps_256:
27113 case Intrinsic::x86_avx_vtestz_pd_256:
27114 case Intrinsic::x86_avx_vtestc_pd_256:
27115 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27116 unsigned TestOpc = X86ISD::PTEST;
27117 X86::CondCode X86CC;
27118 switch (IntNo) {
27119 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27120 case Intrinsic::x86_avx512_ktestc_b:
27121 case Intrinsic::x86_avx512_ktestc_w:
27122 case Intrinsic::x86_avx512_ktestc_d:
27123 case Intrinsic::x86_avx512_ktestc_q:
27124 // CF = 1
27125 TestOpc = X86ISD::KTEST;
27126 X86CC = X86::COND_B;
27127 break;
27128 case Intrinsic::x86_avx512_ktestz_b:
27129 case Intrinsic::x86_avx512_ktestz_w:
27130 case Intrinsic::x86_avx512_ktestz_d:
27131 case Intrinsic::x86_avx512_ktestz_q:
27132 TestOpc = X86ISD::KTEST;
27133 X86CC = X86::COND_E;
27134 break;
27135 case Intrinsic::x86_avx_vtestz_ps:
27136 case Intrinsic::x86_avx_vtestz_pd:
27137 case Intrinsic::x86_avx_vtestz_ps_256:
27138 case Intrinsic::x86_avx_vtestz_pd_256:
27139 TestOpc = X86ISD::TESTP;
27140 [[fallthrough]];
27141 case Intrinsic::x86_sse41_ptestz:
27142 case Intrinsic::x86_avx_ptestz_256:
27143 // ZF = 1
27144 X86CC = X86::COND_E;
27145 break;
27146 case Intrinsic::x86_avx_vtestc_ps:
27147 case Intrinsic::x86_avx_vtestc_pd:
27148 case Intrinsic::x86_avx_vtestc_ps_256:
27149 case Intrinsic::x86_avx_vtestc_pd_256:
27150 TestOpc = X86ISD::TESTP;
27151 [[fallthrough]];
27152 case Intrinsic::x86_sse41_ptestc:
27153 case Intrinsic::x86_avx_ptestc_256:
27154 // CF = 1
27155 X86CC = X86::COND_B;
27156 break;
27157 case Intrinsic::x86_avx_vtestnzc_ps:
27158 case Intrinsic::x86_avx_vtestnzc_pd:
27159 case Intrinsic::x86_avx_vtestnzc_ps_256:
27160 case Intrinsic::x86_avx_vtestnzc_pd_256:
27161 TestOpc = X86ISD::TESTP;
27162 [[fallthrough]];
27163 case Intrinsic::x86_sse41_ptestnzc:
27164 case Intrinsic::x86_avx_ptestnzc_256:
27165 // ZF and CF = 0
27166 X86CC = X86::COND_A;
27167 break;
27168 }
27169
27170 SDValue LHS = Op.getOperand(1);
27171 SDValue RHS = Op.getOperand(2);
27172 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27173 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27174 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27175 }
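// Illustrative note (not part of the original source): for the vector forms,
// the EFLAGS conditions selected above can be modelled on the two inputs as
//   ZF = ((LHS & RHS) == 0)    -> *testz*   lowers to COND_E
//   CF = ((~LHS & RHS) == 0)   -> *testc*   lowers to COND_B
//   !ZF && !CF                 -> *testnzc* lowers to COND_A
// with TESTP applying the same scheme to the per-element sign bits and KTEST
// to the mask registers.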
27176
27177 case Intrinsic::x86_sse42_pcmpistria128:
27178 case Intrinsic::x86_sse42_pcmpestria128:
27179 case Intrinsic::x86_sse42_pcmpistric128:
27180 case Intrinsic::x86_sse42_pcmpestric128:
27181 case Intrinsic::x86_sse42_pcmpistrio128:
27182 case Intrinsic::x86_sse42_pcmpestrio128:
27183 case Intrinsic::x86_sse42_pcmpistris128:
27184 case Intrinsic::x86_sse42_pcmpestris128:
27185 case Intrinsic::x86_sse42_pcmpistriz128:
27186 case Intrinsic::x86_sse42_pcmpestriz128: {
27187 unsigned Opcode;
27188 X86::CondCode X86CC;
27189 switch (IntNo) {
27190 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27191 case Intrinsic::x86_sse42_pcmpistria128:
27192 Opcode = X86ISD::PCMPISTR;
27193 X86CC = X86::COND_A;
27194 break;
27195 case Intrinsic::x86_sse42_pcmpestria128:
27196 Opcode = X86ISD::PCMPESTR;
27197 X86CC = X86::COND_A;
27198 break;
27199 case Intrinsic::x86_sse42_pcmpistric128:
27200 Opcode = X86ISD::PCMPISTR;
27201 X86CC = X86::COND_B;
27202 break;
27203 case Intrinsic::x86_sse42_pcmpestric128:
27204 Opcode = X86ISD::PCMPESTR;
27205 X86CC = X86::COND_B;
27206 break;
27207 case Intrinsic::x86_sse42_pcmpistrio128:
27208 Opcode = X86ISD::PCMPISTR;
27209 X86CC = X86::COND_O;
27210 break;
27211 case Intrinsic::x86_sse42_pcmpestrio128:
27212 Opcode = X86ISD::PCMPESTR;
27213 X86CC = X86::COND_O;
27214 break;
27215 case Intrinsic::x86_sse42_pcmpistris128:
27216 Opcode = X86ISD::PCMPISTR;
27217 X86CC = X86::COND_S;
27218 break;
27219 case Intrinsic::x86_sse42_pcmpestris128:
27220 Opcode = X86ISD::PCMPESTR;
27221 X86CC = X86::COND_S;
27222 break;
27223 case Intrinsic::x86_sse42_pcmpistriz128:
27224 Opcode = X86ISD::PCMPISTR;
27225 X86CC = X86::COND_E;
27226 break;
27227 case Intrinsic::x86_sse42_pcmpestriz128:
27228 Opcode = X86ISD::PCMPESTR;
27229 X86CC = X86::COND_E;
27230 break;
27231 }
27232 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27233 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27234 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27235 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27236 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27237 }
27238
27239 case Intrinsic::x86_sse42_pcmpistri128:
27240 case Intrinsic::x86_sse42_pcmpestri128: {
27241 unsigned Opcode;
27242 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27243 Opcode = X86ISD::PCMPISTR;
27244 else
27245 Opcode = X86ISD::PCMPESTR;
27246
27247 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27248 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27249 return DAG.getNode(Opcode, dl, VTs, NewOps);
27250 }
27251
27252 case Intrinsic::x86_sse42_pcmpistrm128:
27253 case Intrinsic::x86_sse42_pcmpestrm128: {
27254 unsigned Opcode;
27255 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27256 Opcode = X86ISD::PCMPISTR;
27257 else
27258 Opcode = X86ISD::PCMPESTR;
27259
27260 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27261 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27262 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27263 }
27264
27265 case Intrinsic::eh_sjlj_lsda: {
27266 MachineFunction &MF = DAG.getMachineFunction();
27267 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27268 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27269 auto &Context = MF.getContext();
27270 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27271 Twine(MF.getFunctionNumber()));
27272 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27273 DAG.getMCSymbol(S, PtrVT));
27274 }
27275
27276 case Intrinsic::x86_seh_lsda: {
27277 // Compute the symbol for the LSDA. We know it'll get emitted later.
27278 MachineFunction &MF = DAG.getMachineFunction();
27279 SDValue Op1 = Op.getOperand(1);
27280 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27283
27284 // Generate a simple absolute symbol reference. This intrinsic is only
27285 // supported on 32-bit Windows, which isn't PIC.
27286 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27287 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27288 }
27289
27290 case Intrinsic::eh_recoverfp: {
27291 SDValue FnOp = Op.getOperand(1);
27292 SDValue IncomingFPOp = Op.getOperand(2);
27293 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27294 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27295 if (!Fn)
27296 report_fatal_error(
27297 "llvm.eh.recoverfp must take a function as the first argument");
27298 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27299 }
27300
27301 case Intrinsic::localaddress: {
27302 // Returns one of the stack, base, or frame pointer registers, depending on
27303 // which is used to reference local variables.
27304 MachineFunction &MF = DAG.getMachineFunction();
27305 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27306 Register Reg;
27307 if (RegInfo->hasBasePointer(MF))
27308 Reg = RegInfo->getBaseRegister();
27309 else { // Handles the SP or FP case.
27310 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27311 if (CantUseFP)
27312 Reg = RegInfo->getPtrSizedStackRegister(MF);
27313 else
27314 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27315 }
27316 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27317 }
27318 case Intrinsic::x86_avx512_vp2intersect_q_512:
27319 case Intrinsic::x86_avx512_vp2intersect_q_256:
27320 case Intrinsic::x86_avx512_vp2intersect_q_128:
27321 case Intrinsic::x86_avx512_vp2intersect_d_512:
27322 case Intrinsic::x86_avx512_vp2intersect_d_256:
27323 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27324 SDLoc DL(Op);
27325 MVT MaskVT = Op.getSimpleValueType();
27326 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27327 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27328 Op.getOperand(1), Op.getOperand(2));
27329 SDValue Result0 =
27330 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27331 SDValue Result1 =
27332 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27333 return DAG.getMergeValues({Result0, Result1}, DL);
27334 }
27335 case Intrinsic::x86_mmx_pslli_w:
27336 case Intrinsic::x86_mmx_pslli_d:
27337 case Intrinsic::x86_mmx_pslli_q:
27338 case Intrinsic::x86_mmx_psrli_w:
27339 case Intrinsic::x86_mmx_psrli_d:
27340 case Intrinsic::x86_mmx_psrli_q:
27341 case Intrinsic::x86_mmx_psrai_w:
27342 case Intrinsic::x86_mmx_psrai_d: {
27343 SDLoc DL(Op);
27344 SDValue ShAmt = Op.getOperand(2);
27345 // If the argument is a constant, convert it to a target constant.
27346 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27347 // Clamp out-of-bounds shift amounts since they will otherwise be masked
27348 // to 8 bits, which could bring them back into bounds.
27349 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27350 if (ShiftAmount == 0)
27351 return Op.getOperand(1);
27352
27353 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27354 Op.getOperand(0), Op.getOperand(1),
27355 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27356 }
27357
27358 unsigned NewIntrinsic;
27359 switch (IntNo) {
27360 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27361 case Intrinsic::x86_mmx_pslli_w:
27362 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27363 break;
27364 case Intrinsic::x86_mmx_pslli_d:
27365 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27366 break;
27367 case Intrinsic::x86_mmx_pslli_q:
27368 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27369 break;
27370 case Intrinsic::x86_mmx_psrli_w:
27371 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27372 break;
27373 case Intrinsic::x86_mmx_psrli_d:
27374 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27375 break;
27376 case Intrinsic::x86_mmx_psrli_q:
27377 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27378 break;
27379 case Intrinsic::x86_mmx_psrai_w:
27380 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27381 break;
27382 case Intrinsic::x86_mmx_psrai_d:
27383 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27384 break;
27385 }
27386
27387 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
27388 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27389 // MMX register.
27390 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27391 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27392 DAG.getTargetConstant(NewIntrinsic, DL,
27393 getPointerTy(DAG.getDataLayout())),
27394 Op.getOperand(1), ShAmt);
27395 }
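// Illustrative note (not part of the original source): the clamp above matters
// because an out-of-range immediate such as 260 would wrap to 4 once truncated
// to 8 bits and shift by 4 instead of producing zeros; getLimitedValue(255)
// keeps it at 255, which is still >= the element width, so the hardware shift
// still yields the expected all-zeros (or all-sign-bits) result.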
27396 case Intrinsic::thread_pointer: {
27397 if (Subtarget.isTargetELF()) {
27398 SDLoc dl(Op);
27399 EVT PtrVT = Op.getValueType();
27400 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27401 Value *Ptr = Constant::getNullValue(PointerType::get(
27402 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27403 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27404 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27405 }
27406 report_fatal_error(
27407 "Target OS doesn't support __builtin_thread_pointer() yet.");
27408 }
27409 }
27410}
27411
27412 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27413 SDValue Src, SDValue Mask, SDValue Base,
27414 SDValue Index, SDValue ScaleOp, SDValue Chain,
27415 const X86Subtarget &Subtarget) {
27416 SDLoc dl(Op);
27417 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27418 // Scale must be constant.
27419 if (!C)
27420 return SDValue();
27421 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27422 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27423 TLI.getPointerTy(DAG.getDataLayout()));
27424 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27425 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27426 // If source is undef or we know it won't be used, use a zero vector
27427 // to break register dependency.
27428 // TODO: use undef instead and let BreakFalseDeps deal with it?
27429 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27430 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27431
27432 // Cast mask to an integer type.
27433 Mask = DAG.getBitcast(MaskVT, Mask);
27434
27435 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27436
27437 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27438 SDValue Res =
27439 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27440 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27441 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27442}
27443
27444 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27445 SDValue Src, SDValue Mask, SDValue Base,
27446 SDValue Index, SDValue ScaleOp, SDValue Chain,
27447 const X86Subtarget &Subtarget) {
27448 MVT VT = Op.getSimpleValueType();
27449 SDLoc dl(Op);
27450 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27451 // Scale must be constant.
27452 if (!C)
27453 return SDValue();
27454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27455 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27456 TLI.getPointerTy(DAG.getDataLayout()));
27457 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27458 Src.getSimpleValueType().getVectorNumElements());
27459 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27460
27461 // We support two versions of the gather intrinsics. One with scalar mask and
27462 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27463 if (Mask.getValueType() != MaskVT)
27464 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27465
27466 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27467 // If source is undef or we know it won't be used, use a zero vector
27468 // to break register dependency.
27469 // TODO: use undef instead and let BreakFalseDeps deal with it?
27470 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27471 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27472
27473 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27474
27475 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27476 SDValue Res =
27477 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27478 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27479 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27480}
27481
27482 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27483 SDValue Src, SDValue Mask, SDValue Base,
27484 SDValue Index, SDValue ScaleOp, SDValue Chain,
27485 const X86Subtarget &Subtarget) {
27486 SDLoc dl(Op);
27487 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27488 // Scale must be constant.
27489 if (!C)
27490 return SDValue();
27491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27492 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27493 TLI.getPointerTy(DAG.getDataLayout()));
27494 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27495 Src.getSimpleValueType().getVectorNumElements());
27496 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27497
27498 // We support two versions of the scatter intrinsics. One with scalar mask and
27499 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27500 if (Mask.getValueType() != MaskVT)
27501 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27502
27503 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27504
27505 SDVTList VTs = DAG.getVTList(MVT::Other);
27506 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27507 SDValue Res =
27508 DAG.getMemIntrinsicNode(Opc, dl, VTs, Ops,
27509 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27510 return Res;
27511}
27512
27513 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27514 SDValue Mask, SDValue Base, SDValue Index,
27515 SDValue ScaleOp, SDValue Chain,
27516 const X86Subtarget &Subtarget) {
27517 SDLoc dl(Op);
27518 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27519 // Scale must be constant.
27520 if (!C)
27521 return SDValue();
27522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27523 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27524 TLI.getPointerTy(DAG.getDataLayout()));
27525 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27526 SDValue Segment = DAG.getRegister(0, MVT::i32);
27527 MVT MaskVT =
27528 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27529 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27530 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27531 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27532 return SDValue(Res, 0);
27533}
27534
27535/// Handles the lowering of builtin intrinsics with chain that return their
27536/// value into registers EDX:EAX.
27537 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27538/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27539/// TargetOpcode.
27540/// Returns a Glue value which can be used to add extra copy-from-reg if the
27541 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27542 /// EDX:EAX).
27543 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27544 SelectionDAG &DAG,
27545 unsigned TargetOpcode,
27546 unsigned SrcReg,
27547 const X86Subtarget &Subtarget,
27548 SmallVectorImpl<SDValue> &Results) {
27549 SDValue Chain = N->getOperand(0);
27550 SDValue Glue;
27551
27552 if (SrcReg) {
27553 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27554 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27555 Glue = Chain.getValue(1);
27556 }
27557
27558 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27559 SDValue N1Ops[] = {Chain, Glue};
27560 SDNode *N1 = DAG.getMachineNode(
27561 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27562 Chain = SDValue(N1, 0);
27563
27564 // Reads the content of XCR and returns it in registers EDX:EAX.
27565 SDValue LO, HI;
27566 if (Subtarget.is64Bit()) {
27567 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27568 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27569 LO.getValue(2));
27570 } else {
27571 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27572 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27573 LO.getValue(2));
27574 }
27575 Chain = HI.getValue(1);
27576 Glue = HI.getValue(2);
27577
27578 if (Subtarget.is64Bit()) {
27579 // Merge the two 32-bit values into a 64-bit one.
27580 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27581 DAG.getConstant(32, DL, MVT::i8));
27582 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27583 Results.push_back(Chain);
27584 return Glue;
27585 }
27586
27587 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27588 SDValue Ops[] = { LO, HI };
27589 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27590 Results.push_back(Pair);
27591 Results.push_back(Chain);
27592 return Glue;
27593}
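// Illustrative sketch (not part of X86ISelLowering.cpp; the helper name is
// hypothetical): the SHL/OR (or BUILD_PAIR) merge performed above combines the
// two 32-bit halves returned in EDX:EAX exactly like this scalar model.
static inline uint64_t exampleMergeEDXEAX(uint32_t EAX, uint32_t EDX) {
  return ((uint64_t)EDX << 32) | EAX; // high half in EDX, low half in EAX
}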
27594
27595/// Handles the lowering of builtin intrinsics that read the time stamp counter
27596/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27597/// READCYCLECOUNTER nodes.
27598static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27599 SelectionDAG &DAG,
27600 const X86Subtarget &Subtarget,
27601 SmallVectorImpl<SDValue> &Results) {
27602 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27603 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27604 // and the EAX register is loaded with the low-order 32 bits.
27605 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27606 /* NoRegister */0, Subtarget,
27607 Results);
27608 if (Opcode != X86::RDTSCP)
27609 return;
27610
27611 SDValue Chain = Results[1];
27612 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27613 // the ECX register. Add 'ecx' explicitly to the chain.
27614 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27615 Results[1] = ecx;
27616 Results.push_back(ecx.getValue(1));
27617}
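// Illustrative usage (not part of X86ISelLowering.cpp): the extra ECX result
// appended above for RDTSCP corresponds to the IA32_TSC_AUX out-parameter of
// the compiler builtin, roughly:
//   unsigned Aux;
//   unsigned long long TSC = __rdtscp(&Aux); // from <x86intrin.h> on x86 hosts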
27618
27619 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27620 SelectionDAG &DAG) {
27621 SmallVector<SDValue, 3> Results;
27622 SDLoc DL(Op);
27623 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27624 Results);
27625 return DAG.getMergeValues(Results, DL);
27626}
27627
27628 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27629 MachineFunction &MF = DAG.getMachineFunction();
27630 SDValue Chain = Op.getOperand(0);
27631 SDValue RegNode = Op.getOperand(2);
27632 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27633 if (!EHInfo)
27634 report_fatal_error("EH registrations only live in functions using WinEH");
27635
27636 // Cast the operand to an alloca, and remember the frame index.
27637 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27638 if (!FINode)
27639 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27640 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27641
27642 // Return the chain operand without making any DAG nodes.
27643 return Chain;
27644}
27645
27646 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27647 MachineFunction &MF = DAG.getMachineFunction();
27648 SDValue Chain = Op.getOperand(0);
27649 SDValue EHGuard = Op.getOperand(2);
27650 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27651 if (!EHInfo)
27652 report_fatal_error("EHGuard only live in functions using WinEH");
27653
27654 // Cast the operand to an alloca, and remember the frame index.
27655 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27656 if (!FINode)
27657 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27658 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27659
27660 // Return the chain operand without making any DAG nodes.
27661 return Chain;
27662}
27663
27664/// Emit Truncating Store with signed or unsigned saturation.
27665static SDValue
27666EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27667 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27668 SelectionDAG &DAG) {
27669 SDVTList VTs = DAG.getVTList(MVT::Other);
27670 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27671 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27672 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27673 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27674}
27675
27676/// Emit Masked Truncating Store with signed or unsigned saturation.
27677static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27678 const SDLoc &DL,
27679 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27680 MachineMemOperand *MMO, SelectionDAG &DAG) {
27681 SDVTList VTs = DAG.getVTList(MVT::Other);
27682 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27683 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27684 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27685}
27686
27687 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27688 const MachineFunction &MF) {
27689 if (!Subtarget.is64Bit())
27690 return false;
27691 // 64-bit targets support extended Swift async frame setup,
27692 // except for targets that use the windows 64 prologue.
27693 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27694}
27695
27696 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27697 SelectionDAG &DAG) {
27698 unsigned IntNo = Op.getConstantOperandVal(1);
27699 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27700 if (!IntrData) {
27701 switch (IntNo) {
27702
27703 case Intrinsic::swift_async_context_addr: {
27704 SDLoc dl(Op);
27705 auto &MF = DAG.getMachineFunction();
27706 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27707 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27709 X86FI->setHasSwiftAsyncContext(true);
27710 SDValue Chain = Op->getOperand(0);
27711 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27712 SDValue Result =
27713 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27714 DAG.getTargetConstant(8, dl, MVT::i32)),
27715 0);
27716 // Return { result, chain }.
27717 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27718 CopyRBP.getValue(1));
27719 } else {
27720 // No special extended frame, create or reuse an existing stack slot.
27721 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27722 if (!X86FI->getSwiftAsyncContextFrameIdx())
27723 X86FI->setSwiftAsyncContextFrameIdx(
27724 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27725 false));
27726 SDValue Result =
27727 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27728 PtrSize == 8 ? MVT::i64 : MVT::i32);
27729 // Return { result, chain }.
27730 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27731 Op->getOperand(0));
27732 }
27733 }
27734
27735 case llvm::Intrinsic::x86_seh_ehregnode:
27736 return MarkEHRegistrationNode(Op, DAG);
27737 case llvm::Intrinsic::x86_seh_ehguard:
27738 return MarkEHGuard(Op, DAG);
27739 case llvm::Intrinsic::x86_rdpkru: {
27740 SDLoc dl(Op);
27741 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27742 // Create a RDPKRU node and pass 0 to the ECX parameter.
27743 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27744 DAG.getConstant(0, dl, MVT::i32));
27745 }
27746 case llvm::Intrinsic::x86_wrpkru: {
27747 SDLoc dl(Op);
27748 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27749 // to the EDX and ECX parameters.
27750 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27751 Op.getOperand(0), Op.getOperand(2),
27752 DAG.getConstant(0, dl, MVT::i32),
27753 DAG.getConstant(0, dl, MVT::i32));
27754 }
27755 case llvm::Intrinsic::asan_check_memaccess: {
27756 // Mark this as adjustsStack because it will be lowered to a call.
27757 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27758 // Don't do anything here, we will expand these intrinsics out later.
27759 return Op;
27760 }
27761 case llvm::Intrinsic::x86_flags_read_u32:
27762 case llvm::Intrinsic::x86_flags_read_u64:
27763 case llvm::Intrinsic::x86_flags_write_u32:
27764 case llvm::Intrinsic::x86_flags_write_u64: {
27765 // We need a frame pointer because this will get lowered to a PUSH/POP
27766 // sequence.
27767 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27768 MFI.setHasCopyImplyingStackAdjustment(true);
27769 // Don't do anything here, we will expand these intrinsics out later
27770 // during FinalizeISel in EmitInstrWithCustomInserter.
27771 return Op;
27772 }
27773 case Intrinsic::x86_lwpins32:
27774 case Intrinsic::x86_lwpins64:
27775 case Intrinsic::x86_umwait:
27776 case Intrinsic::x86_tpause: {
27777 SDLoc dl(Op);
27778 SDValue Chain = Op->getOperand(0);
27779 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27780 unsigned Opcode;
27781
27782 switch (IntNo) {
27783 default: llvm_unreachable("Impossible intrinsic");
27784 case Intrinsic::x86_umwait:
27785 Opcode = X86ISD::UMWAIT;
27786 break;
27787 case Intrinsic::x86_tpause:
27788 Opcode = X86ISD::TPAUSE;
27789 break;
27790 case Intrinsic::x86_lwpins32:
27791 case Intrinsic::x86_lwpins64:
27792 Opcode = X86ISD::LWPINS;
27793 break;
27794 }
27795
27796 SDValue Operation =
27797 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27798 Op->getOperand(3), Op->getOperand(4));
27799 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27800 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27801 Operation.getValue(1));
27802 }
27803 case Intrinsic::x86_enqcmd:
27804 case Intrinsic::x86_enqcmds: {
27805 SDLoc dl(Op);
27806 SDValue Chain = Op.getOperand(0);
27807 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27808 unsigned Opcode;
27809 switch (IntNo) {
27810 default: llvm_unreachable("Impossible intrinsic!");
27811 case Intrinsic::x86_enqcmd:
27812 Opcode = X86ISD::ENQCMD;
27813 break;
27814 case Intrinsic::x86_enqcmds:
27815 Opcode = X86ISD::ENQCMDS;
27816 break;
27817 }
27818 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27819 Op.getOperand(3));
27820 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27821 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27822 Operation.getValue(1));
27823 }
27824 case Intrinsic::x86_aesenc128kl:
27825 case Intrinsic::x86_aesdec128kl:
27826 case Intrinsic::x86_aesenc256kl:
27827 case Intrinsic::x86_aesdec256kl: {
27828 SDLoc DL(Op);
27829 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27830 SDValue Chain = Op.getOperand(0);
27831 unsigned Opcode;
27832
27833 switch (IntNo) {
27834 default: llvm_unreachable("Impossible intrinsic");
27835 case Intrinsic::x86_aesenc128kl:
27836 Opcode = X86ISD::AESENC128KL;
27837 break;
27838 case Intrinsic::x86_aesdec128kl:
27839 Opcode = X86ISD::AESDEC128KL;
27840 break;
27841 case Intrinsic::x86_aesenc256kl:
27842 Opcode = X86ISD::AESENC256KL;
27843 break;
27844 case Intrinsic::x86_aesdec256kl:
27845 Opcode = X86ISD::AESDEC256KL;
27846 break;
27847 }
27848
27849 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27850 MachineMemOperand *MMO = MemIntr->getMemOperand();
27851 EVT MemVT = MemIntr->getMemoryVT();
27852 SDValue Operation = DAG.getMemIntrinsicNode(
27853 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27854 MMO);
27855 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27856
27857 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27858 {ZF, Operation.getValue(0), Operation.getValue(2)});
27859 }
27860 case Intrinsic::x86_aesencwide128kl:
27861 case Intrinsic::x86_aesdecwide128kl:
27862 case Intrinsic::x86_aesencwide256kl:
27863 case Intrinsic::x86_aesdecwide256kl: {
27864 SDLoc DL(Op);
27865 SDVTList VTs = DAG.getVTList(
27866 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27867 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27868 SDValue Chain = Op.getOperand(0);
27869 unsigned Opcode;
27870
27871 switch (IntNo) {
27872 default: llvm_unreachable("Impossible intrinsic");
27873 case Intrinsic::x86_aesencwide128kl:
27874 Opcode = X86ISD::AESENCWIDE128KL;
27875 break;
27876 case Intrinsic::x86_aesdecwide128kl:
27877 Opcode = X86ISD::AESDECWIDE128KL;
27878 break;
27879 case Intrinsic::x86_aesencwide256kl:
27880 Opcode = X86ISD::AESENCWIDE256KL;
27881 break;
27882 case Intrinsic::x86_aesdecwide256kl:
27883 Opcode = X86ISD::AESDECWIDE256KL;
27884 break;
27885 }
27886
27887 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27888 MachineMemOperand *MMO = MemIntr->getMemOperand();
27889 EVT MemVT = MemIntr->getMemoryVT();
27890 SDValue Operation = DAG.getMemIntrinsicNode(
27891 Opcode, DL, VTs,
27892 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27893 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27894 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27895 MemVT, MMO);
27896 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27897
27898 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27899 {ZF, Operation.getValue(1), Operation.getValue(2),
27900 Operation.getValue(3), Operation.getValue(4),
27901 Operation.getValue(5), Operation.getValue(6),
27902 Operation.getValue(7), Operation.getValue(8),
27903 Operation.getValue(9)});
27904 }
27905 case Intrinsic::x86_testui: {
27906 SDLoc dl(Op);
27907 SDValue Chain = Op.getOperand(0);
27908 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27909 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27910 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27911 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27912 Operation.getValue(1));
27913 }
27914 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27915 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27916 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27917 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27918 case Intrinsic::x86_t2rpntlvwz0_internal:
27919 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27920 case Intrinsic::x86_t2rpntlvwz1_internal:
27921 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27922 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27923 X86MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
27924 unsigned IntNo = Op.getConstantOperandVal(1);
27925 unsigned Opc = 0;
27926 switch (IntNo) {
27927 default:
27928 llvm_unreachable("Unexpected intrinsic!");
27929 case Intrinsic::x86_t2rpntlvwz0_internal:
27930 Opc = X86::PT2RPNTLVWZ0V;
27931 break;
27932 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27933 Opc = X86::PT2RPNTLVWZ0T1V;
27934 break;
27935 case Intrinsic::x86_t2rpntlvwz1_internal:
27936 Opc = X86::PT2RPNTLVWZ1V;
27937 break;
27938 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27939 Opc = X86::PT2RPNTLVWZ1T1V;
27940 break;
27941 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27942 Opc = X86::PT2RPNTLVWZ0RSV;
27943 break;
27944 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27945 Opc = X86::PT2RPNTLVWZ0RST1V;
27946 break;
27947 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27948 Opc = X86::PT2RPNTLVWZ1RSV;
27949 break;
27950 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27951 Opc = X86::PT2RPNTLVWZ1RST1V;
27952 break;
27953 }
27954
27955 SDLoc DL(Op);
27956 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27957
27958 SDValue Ops[] = {Op.getOperand(2), // Row
27959 Op.getOperand(3), // Col0
27960 Op.getOperand(4), // Col1
27961 Op.getOperand(5), // Base
27962 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27963 Op.getOperand(6), // Index
27964 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27965 DAG.getRegister(0, MVT::i16), // Segment
27966 Op.getOperand(0)}; // Chain
27967
27968 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
27969 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
27970 SDValue(Res, 0));
27971 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
27972 SDValue(Res, 0));
27973 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
27974 }
27975 case Intrinsic::x86_atomic_bts_rm:
27976 case Intrinsic::x86_atomic_btc_rm:
27977 case Intrinsic::x86_atomic_btr_rm: {
27978 SDLoc DL(Op);
27979 MVT VT = Op.getSimpleValueType();
27980 SDValue Chain = Op.getOperand(0);
27981 SDValue Op1 = Op.getOperand(2);
27982 SDValue Op2 = Op.getOperand(3);
27983 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
27984 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
27985 : X86ISD::LBTR_RM;
27986 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27987 SDValue Res =
27988 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27989 {Chain, Op1, Op2}, VT, MMO);
27990 Chain = Res.getValue(1);
27991 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
27992 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27993 }
27994 case Intrinsic::x86_atomic_bts:
27995 case Intrinsic::x86_atomic_btc:
27996 case Intrinsic::x86_atomic_btr: {
27997 SDLoc DL(Op);
27998 MVT VT = Op.getSimpleValueType();
27999 SDValue Chain = Op.getOperand(0);
28000 SDValue Op1 = Op.getOperand(2);
28001 SDValue Op2 = Op.getOperand(3);
28002 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28003 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28004 : X86ISD::LBTR;
28005 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28006 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28007 SDValue Res =
28008 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28009 {Chain, Op1, Op2, Size}, VT, MMO);
28010 Chain = Res.getValue(1);
28011 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28012 unsigned Imm = Op2->getAsZExtVal();
28013 if (Imm)
28014 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28015 DAG.getShiftAmountConstant(Imm, VT, DL));
28016 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28017 }
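// Illustrative note (not part of the original source): the setcc above
// materializes CF (the value of the tested bit) as 0 or 1 in bit 0, and the
// SHL moves it back to bit position Imm, so the lowered result is equivalent
// to OldValue & (1 << Imm) for the bit-test-and-set/complement/reset family.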
28018 case Intrinsic::x86_cmpccxadd32:
28019 case Intrinsic::x86_cmpccxadd64: {
28020 SDLoc DL(Op);
28021 SDValue Chain = Op.getOperand(0);
28022 SDValue Addr = Op.getOperand(2);
28023 SDValue Src1 = Op.getOperand(3);
28024 SDValue Src2 = Op.getOperand(4);
28025 SDValue CC = Op.getOperand(5);
28026 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28027 SDValue Operation = DAG.getMemIntrinsicNode(
28028 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28029 MVT::i32, MMO);
28030 return Operation;
28031 }
28032 case Intrinsic::x86_aadd32:
28033 case Intrinsic::x86_aadd64:
28034 case Intrinsic::x86_aand32:
28035 case Intrinsic::x86_aand64:
28036 case Intrinsic::x86_aor32:
28037 case Intrinsic::x86_aor64:
28038 case Intrinsic::x86_axor32:
28039 case Intrinsic::x86_axor64: {
28040 SDLoc DL(Op);
28041 SDValue Chain = Op.getOperand(0);
28042 SDValue Op1 = Op.getOperand(2);
28043 SDValue Op2 = Op.getOperand(3);
28044 MVT VT = Op2.getSimpleValueType();
28045 unsigned Opc = 0;
28046 switch (IntNo) {
28047 default:
28048 llvm_unreachable("Unknown Intrinsic");
28049 case Intrinsic::x86_aadd32:
28050 case Intrinsic::x86_aadd64:
28051 Opc = X86ISD::AADD;
28052 break;
28053 case Intrinsic::x86_aand32:
28054 case Intrinsic::x86_aand64:
28055 Opc = X86ISD::AAND;
28056 break;
28057 case Intrinsic::x86_aor32:
28058 case Intrinsic::x86_aor64:
28059 Opc = X86ISD::AOR;
28060 break;
28061 case Intrinsic::x86_axor32:
28062 case Intrinsic::x86_axor64:
28063 Opc = X86ISD::AXOR;
28064 break;
28065 }
28066 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28067 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28068 {Chain, Op1, Op2}, VT, MMO);
28069 }
28070 case Intrinsic::x86_atomic_add_cc:
28071 case Intrinsic::x86_atomic_sub_cc:
28072 case Intrinsic::x86_atomic_or_cc:
28073 case Intrinsic::x86_atomic_and_cc:
28074 case Intrinsic::x86_atomic_xor_cc: {
28075 SDLoc DL(Op);
28076 SDValue Chain = Op.getOperand(0);
28077 SDValue Op1 = Op.getOperand(2);
28078 SDValue Op2 = Op.getOperand(3);
28079 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28080 MVT VT = Op2.getSimpleValueType();
28081 unsigned Opc = 0;
28082 switch (IntNo) {
28083 default:
28084 llvm_unreachable("Unknown Intrinsic");
28085 case Intrinsic::x86_atomic_add_cc:
28086 Opc = X86ISD::LADD;
28087 break;
28088 case Intrinsic::x86_atomic_sub_cc:
28089 Opc = X86ISD::LSUB;
28090 break;
28091 case Intrinsic::x86_atomic_or_cc:
28092 Opc = X86ISD::LOR;
28093 break;
28094 case Intrinsic::x86_atomic_and_cc:
28095 Opc = X86ISD::LAND;
28096 break;
28097 case Intrinsic::x86_atomic_xor_cc:
28098 Opc = X86ISD::LXOR;
28099 break;
28100 }
28101 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28102 SDValue LockArith =
28103 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28104 {Chain, Op1, Op2}, VT, MMO);
28105 Chain = LockArith.getValue(1);
28106 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28107 }
28108 }
28109 return SDValue();
28110 }
28111
28112 SDLoc dl(Op);
28113 switch(IntrData->Type) {
28114 default: llvm_unreachable("Unknown Intrinsic Type");
28115 case RDSEED:
28116 case RDRAND: {
28117 // Emit the node with the right value type.
28118 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28119 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28120
28121 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28122 // Otherwise return the value from Rand, which is always 0, cast to i32.
28123 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28124 DAG.getConstant(1, dl, Op->getValueType(1)),
28125 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28126 SDValue(Result.getNode(), 1)};
28127 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28128
28129 // Return { result, isValid, chain }.
28130 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28131 SDValue(Result.getNode(), 2));
28132 }
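// Illustrative note (not part of the original source): the CMOV above computes
//   isValid = CF ? 1 : Result
// and a failed RDRAND/RDSEED leaves its destination (Result) at 0, so isValid
// is simply the carry flag materialized as 0 or 1.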
28133 case GATHER_AVX2: {
28134 SDValue Chain = Op.getOperand(0);
28135 SDValue Src = Op.getOperand(2);
28136 SDValue Base = Op.getOperand(3);
28137 SDValue Index = Op.getOperand(4);
28138 SDValue Mask = Op.getOperand(5);
28139 SDValue Scale = Op.getOperand(6);
28140 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28141 Scale, Chain, Subtarget);
28142 }
28143 case GATHER: {
28144 //gather(v1, mask, index, base, scale);
28145 SDValue Chain = Op.getOperand(0);
28146 SDValue Src = Op.getOperand(2);
28147 SDValue Base = Op.getOperand(3);
28148 SDValue Index = Op.getOperand(4);
28149 SDValue Mask = Op.getOperand(5);
28150 SDValue Scale = Op.getOperand(6);
28151 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28152 Chain, Subtarget);
28153 }
28154 case SCATTER: {
28155 //scatter(base, mask, index, v1, scale);
28156 SDValue Chain = Op.getOperand(0);
28157 SDValue Base = Op.getOperand(2);
28158 SDValue Mask = Op.getOperand(3);
28159 SDValue Index = Op.getOperand(4);
28160 SDValue Src = Op.getOperand(5);
28161 SDValue Scale = Op.getOperand(6);
28162 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28163 Scale, Chain, Subtarget);
28164 }
28165 case PREFETCH: {
28166 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28167 assert((HintVal == 2 || HintVal == 3) &&
28168 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28169 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28170 SDValue Chain = Op.getOperand(0);
28171 SDValue Mask = Op.getOperand(2);
28172 SDValue Index = Op.getOperand(3);
28173 SDValue Base = Op.getOperand(4);
28174 SDValue Scale = Op.getOperand(5);
28175 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28176 Subtarget);
28177 }
28178 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28179 case RDTSC: {
28180 SmallVector<SDValue, 2> Results;
28181 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28182 Results);
28183 return DAG.getMergeValues(Results, dl);
28184 }
28185 // Read Performance Monitoring Counters.
28186 case RDPMC:
28187 // Read Processor Register.
28188 case RDPRU:
28189 // GetExtended Control Register.
28190 case XGETBV: {
28191 SmallVector<SDValue, 2> Results;
28192
28193 // RDPMC uses ECX to select the index of the performance counter to read.
28194 // RDPRU uses ECX to select the processor register to read.
28195 // XGETBV uses ECX to select the index of the XCR register to return.
28196 // The result is stored into registers EDX:EAX.
28197 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28198 Subtarget, Results);
28199 return DAG.getMergeValues(Results, dl);
28200 }
28201 // XTEST intrinsics.
28202 case XTEST: {
28203 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28204 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28205
28206 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28207 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28208 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28209 Ret, SDValue(InTrans.getNode(), 1));
28210 }
28211 case TRUNCATE_TO_MEM_VI8:
28212 case TRUNCATE_TO_MEM_VI16:
28213 case TRUNCATE_TO_MEM_VI32: {
28214 SDValue Mask = Op.getOperand(4);
28215 SDValue DataToTruncate = Op.getOperand(3);
28216 SDValue Addr = Op.getOperand(2);
28217 SDValue Chain = Op.getOperand(0);
28218
28219 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28220 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28221
28222 EVT MemVT = MemIntr->getMemoryVT();
28223
28224 uint16_t TruncationOp = IntrData->Opc0;
28225 switch (TruncationOp) {
28226 case X86ISD::VTRUNC: {
28227 if (isAllOnesConstant(Mask)) // return just a truncate store
28228 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28229 MemIntr->getMemOperand());
28230
28231 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28232 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28233 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28234
28235 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28236 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28237 true /* truncating */);
28238 }
28239 case X86ISD::VTRUNCUS:
28240 case X86ISD::VTRUNCS: {
28241 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28242 if (isAllOnesConstant(Mask))
28243 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28244 MemIntr->getMemOperand(), DAG);
28245
28246 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28247 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28248
28249 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28250 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28251 }
28252 default:
28253 llvm_unreachable("Unsupported truncstore intrinsic");
28254 }
28255 }
28256 case INTR_TYPE_CAST_MMX:
28257 return SDValue(); // handled in combineINTRINSIC_*
28258 }
28259}
28260
28261SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28262 SelectionDAG &DAG) const {
28263 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28264 MFI.setReturnAddressIsTaken(true);
28265
28266 unsigned Depth = Op.getConstantOperandVal(0);
28267 SDLoc dl(Op);
28268 EVT PtrVT = Op.getValueType();
28269
28270 if (Depth > 0) {
28271 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28272 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28273 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28274 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28275 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28276 MachinePointerInfo());
28277 }
28278
28279 // Just load the return address.
28280 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28281 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28282 MachinePointerInfo());
28283}
28284
28285SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28286 SelectionDAG &DAG) const {
28287 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28288 return getReturnAddressFrameIndex(DAG);
28289}
28290
28291SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28292 MachineFunction &MF = DAG.getMachineFunction();
28293 MachineFrameInfo &MFI = MF.getFrameInfo();
28294 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28295 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28296 EVT VT = Op.getValueType();
28297
28298 MFI.setFrameAddressIsTaken(true);
28299
28300 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28301 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28302 // is not possible to crawl up the stack without looking at the unwind codes
28303 // simultaneously.
28304 int FrameAddrIndex = FuncInfo->getFAIndex();
28305 if (!FrameAddrIndex) {
28306 // Set up a frame object for the return address.
28307 unsigned SlotSize = RegInfo->getSlotSize();
28308 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28309 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28310 FuncInfo->setFAIndex(FrameAddrIndex);
28311 }
28312 return DAG.getFrameIndex(FrameAddrIndex, VT);
28313 }
28314
28315 Register FrameReg =
28316 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28317 SDLoc dl(Op); // FIXME probably not meaningful
28318 unsigned Depth = Op.getConstantOperandVal(0);
28319 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28320 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28321 "Invalid Frame Register!");
28322 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28323 while (Depth--)
28324 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28325 MachinePointerInfo());
28326 return FrameAddr;
28327}
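// Illustrative sketch (not part of X86ISelLowering.cpp; the helper name is
// hypothetical): the Depth loop above emits one load per level, chasing the
// saved frame pointer stored at offset 0 of each frame-pointer-based frame:
static inline void *exampleWalkFrameChain(void *FramePtr, unsigned Depth) {
  while (Depth--)
    FramePtr = *(void **)FramePtr; // saved caller RBP/EBP lives at [FP + 0]
  return FramePtr;
}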
28328
28329// FIXME? Maybe this could be a TableGen attribute on some registers and
28330// this table could be generated automatically from RegInfo.
28331 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28332 const MachineFunction &MF) const {
28333 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28334
28335 Register Reg = StringSwitch<unsigned>(RegName)
28336 .Case("esp", X86::ESP)
28337 .Case("rsp", X86::RSP)
28338 .Case("ebp", X86::EBP)
28339 .Case("rbp", X86::RBP)
28340 .Case("r14", X86::R14)
28341 .Case("r15", X86::R15)
28342 .Default(0);
28343
28344 if (Reg == X86::EBP || Reg == X86::RBP) {
28345 if (!TFI.hasFP(MF))
28346 report_fatal_error("register " + StringRef(RegName) +
28347 " is allocatable: function has no frame pointer");
28348#ifndef NDEBUG
28349 else {
28350 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28351 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28352 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28353 "Invalid Frame Register!");
28354 }
28355#endif
28356 }
28357
28358 return Reg;
28359}
28360
28361SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28362 SelectionDAG &DAG) const {
28363 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28364 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28365}
28366
28367 Register X86TargetLowering::getExceptionPointerRegister(
28368 const Constant *PersonalityFn) const {
28369 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28370 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28371
28372 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28373}
28374
28375 Register X86TargetLowering::getExceptionSelectorRegister(
28376 const Constant *PersonalityFn) const {
28377 // Funclet personalities don't use selectors (the runtime does the selection).
28378 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28379 return X86::NoRegister;
28380 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28381}
28382
28383 bool X86TargetLowering::needsFixedCatchObjects() const {
28384 return Subtarget.isTargetWin64();
28385}
28386
28387SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28388 SDValue Chain = Op.getOperand(0);
28389 SDValue Offset = Op.getOperand(1);
28390 SDValue Handler = Op.getOperand(2);
28391 SDLoc dl (Op);
28392
28393 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28394 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28395 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28396 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28397 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28398 "Invalid Frame Register!");
28399 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28400 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28401
28402 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28403 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28404 dl));
28405 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28406 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28407 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28408
28409 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28410 DAG.getRegister(StoreAddrReg, PtrVT));
28411}
28412
28413SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28414 SelectionDAG &DAG) const {
28415 SDLoc DL(Op);
28416 // If the subtarget is not 64bit, we may need the global base reg
28417 // after isel expand pseudo, i.e., after CGBR pass ran.
28418 // Therefore, ask for the GlobalBaseReg now, so that the pass
28419 // inserts the code for us in case we need it.
28420 // Otherwise, we will end up in a situation where we will
28421 // reference a virtual register that is not defined!
28422 if (!Subtarget.is64Bit()) {
28423 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28424 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28425 }
28426 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28427 DAG.getVTList(MVT::i32, MVT::Other),
28428 Op.getOperand(0), Op.getOperand(1));
28429}
28430
28431SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28432 SelectionDAG &DAG) const {
28433 SDLoc DL(Op);
28434 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28435 Op.getOperand(0), Op.getOperand(1));
28436}
28437
28438SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28439 SelectionDAG &DAG) const {
28440 SDLoc DL(Op);
28441 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28442 Op.getOperand(0));
28443}
28444
28445 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28446 return Op.getOperand(0);
28447}
28448
28449SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28450 SelectionDAG &DAG) const {
28451 SDValue Root = Op.getOperand(0);
28452 SDValue Trmp = Op.getOperand(1); // trampoline
28453 SDValue FPtr = Op.getOperand(2); // nested function
28454 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28455 SDLoc dl (Op);
28456
28457 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28458 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28459
28460 if (Subtarget.is64Bit()) {
28461 SDValue OutChains[6];
28462
28463 // Large code-model.
28464 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28465 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28466
28467 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28468 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28469
28470 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28471
28472 // Load the pointer to the nested function into R11.
28473 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28474 SDValue Addr = Trmp;
28475 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28476 Addr, MachinePointerInfo(TrmpAddr));
28477
28478 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28479 DAG.getConstant(2, dl, MVT::i64));
28480 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28481 MachinePointerInfo(TrmpAddr, 2), Align(2));
28482
28483 // Load the 'nest' parameter value into R10.
28484 // R10 is specified in X86CallingConv.td
28485 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28486 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28487 DAG.getConstant(10, dl, MVT::i64));
28488 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28489 Addr, MachinePointerInfo(TrmpAddr, 10));
28490
28491 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28492 DAG.getConstant(12, dl, MVT::i64));
28493 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28494 MachinePointerInfo(TrmpAddr, 12), Align(2));
28495
28496 // Jump to the nested function.
28497 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28498 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28499 DAG.getConstant(20, dl, MVT::i64));
28500 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28501 Addr, MachinePointerInfo(TrmpAddr, 20));
28502
28503 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28504 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28505 DAG.getConstant(22, dl, MVT::i64));
28506 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28507 Addr, MachinePointerInfo(TrmpAddr, 22));
28508
28509 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28510 } else {
28511 const Function *Func =
28512 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28513 CallingConv::ID CC = Func->getCallingConv();
28514 unsigned NestReg;
28515
28516 switch (CC) {
28517 default:
28518 llvm_unreachable("Unsupported calling convention");
28519 case CallingConv::C:
28520 case CallingConv::X86_StdCall: {
28521 // Pass 'nest' parameter in ECX.
28522 // Must be kept in sync with X86CallingConv.td
28523 NestReg = X86::ECX;
28524
28525 // Check that ECX wasn't needed by an 'inreg' parameter.
28526 FunctionType *FTy = Func->getFunctionType();
28527 const AttributeList &Attrs = Func->getAttributes();
28528
28529 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28530 unsigned InRegCount = 0;
28531 unsigned Idx = 0;
28532
28533 for (FunctionType::param_iterator I = FTy->param_begin(),
28534 E = FTy->param_end(); I != E; ++I, ++Idx)
28535 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28536 const DataLayout &DL = DAG.getDataLayout();
28537 // FIXME: should only count parameters that are lowered to integers.
28538 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28539 }
28540
28541 if (InRegCount > 2) {
28542 report_fatal_error("Nest register in use - reduce number of inreg"
28543 " parameters!");
28544 }
28545 }
28546 break;
28547 }
28548 case CallingConv::X86_FastCall:
28549 case CallingConv::X86_ThisCall:
28550 case CallingConv::Fast:
28551 case CallingConv::Tail:
28552 case CallingConv::SwiftTail:
28553 // Pass 'nest' parameter in EAX.
28554 // Must be kept in sync with X86CallingConv.td
28555 NestReg = X86::EAX;
28556 break;
28557 }
28558
28559 SDValue OutChains[4];
28560 SDValue Addr, Disp;
28561
28562 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28563 DAG.getConstant(10, dl, MVT::i32));
28564 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28565
28566 // This is storing the opcode for MOV32ri.
28567 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28568 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28569 OutChains[0] =
28570 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28571 Trmp, MachinePointerInfo(TrmpAddr));
28572
28573 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28574 DAG.getConstant(1, dl, MVT::i32));
28575 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28576 MachinePointerInfo(TrmpAddr, 1), Align(1));
28577
28578 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28579 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28580 DAG.getConstant(5, dl, MVT::i32));
28581 OutChains[2] =
28582 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28583 MachinePointerInfo(TrmpAddr, 5), Align(1));
28584
28585 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28586 DAG.getConstant(6, dl, MVT::i32));
28587 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28588 MachinePointerInfo(TrmpAddr, 6), Align(1));
28589
28590 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28591 }
28592}
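// Illustrative note (not part of the original source): on x86-64 the stores
// above assemble a 23-byte trampoline equivalent to
//   49 BB <FPtr:8>   movabsq $FPtr, %r11
//   49 BA <Nest:8>   movabsq $Nest, %r10
//   49 FF E3         jmpq    *%r11
// The i16 opcode constants are written little-endian, so e.g. 0xBB49 appears
// in memory as the byte sequence 49 BB.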
28593
28594SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28595 SelectionDAG &DAG) const {
28596 /*
28597 The rounding mode is in bits 11:10 of FPSR, and has the following
28598 settings:
28599 00 Round to nearest
28600 01 Round to -inf
28601 10 Round to +inf
28602 11 Round to 0
28603
28604 GET_ROUNDING, on the other hand, expects the following:
28605 -1 Undefined
28606 0 Round to 0
28607 1 Round to nearest
28608 2 Round to +inf
28609 3 Round to -inf
28610
28611 To perform the conversion, we use a packed lookup table of the four 2-bit
28612 values that we can index by FPSR[11:10]
28613 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28614
28615 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
28616 */
28617
28618 MachineFunction &MF = DAG.getMachineFunction();
28619 MVT VT = Op.getSimpleValueType();
28620 SDLoc DL(Op);
28621
28622 // Save FP Control Word to stack slot
28623 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28624 SDValue StackSlot =
28625 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28626
28627 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28628
28629 SDValue Chain = Op.getOperand(0);
28630 SDValue Ops[] = {Chain, StackSlot};
28631 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28632 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28633 MachineMemOperand::MOStore);
28634
28635 // Load FP Control Word from stack slot
28636 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28637 Chain = CWD.getValue(1);
28638
28639 // Mask and turn the control bits into a shift for the lookup table.
28640 SDValue Shift =
28641 DAG.getNode(ISD::SRL, DL, MVT::i16,
28642 DAG.getNode(ISD::AND, DL, MVT::i16,
28643 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28644 DAG.getConstant(9, DL, MVT::i8));
28645 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28646
28647 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28648 SDValue RetVal =
28649 DAG.getNode(ISD::AND, DL, MVT::i32,
28650 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28651 DAG.getConstant(3, DL, MVT::i32));
28652
28653 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28654
28655 return DAG.getMergeValues({RetVal, Chain}, DL);
28656}
28657
28658SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28659 SelectionDAG &DAG) const {
28660 MachineFunction &MF = DAG.getMachineFunction();
28661 SDLoc DL(Op);
28662 SDValue Chain = Op.getNode()->getOperand(0);
28663
28664 // FP control word may be set only from data in memory. So we need to allocate
28665 // stack space to save/load FP control word.
28666 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28667 SDValue StackSlot =
28668 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28669 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28670 MachineMemOperand *MMO =
28671       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28672 
28673 // Store FP control word into memory.
28674 SDValue Ops[] = {Chain, StackSlot};
28675 Chain = DAG.getMemIntrinsicNode(
28676 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28677
28678 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28679 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28680 Chain = CWD.getValue(1);
28681 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28682 DAG.getConstant(0xf3ff, DL, MVT::i16));
28683
28684 // Calculate new rounding mode.
28685 SDValue NewRM = Op.getNode()->getOperand(1);
28686 SDValue RMBits;
28687 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28688 uint64_t RM = CVal->getZExtValue();
28689 int FieldVal;
28690 switch (static_cast<RoundingMode>(RM)) {
28691 // clang-format off
28692 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
28693 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
28694 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
28695 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
28696 default:
28697 llvm_unreachable("rounding mode is not supported by X86 hardware");
28698 // clang-format on
28699 }
28700 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28701 } else {
28702 // Need to convert argument into bits of control word:
28703 // 0 Round to 0 -> 11
28704 // 1 Round to nearest -> 00
28705 // 2 Round to +inf -> 10
28706 // 3 Round to -inf -> 01
28707     // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28708 // To make the conversion, put all these values into a value 0xc9 and shift
28709 // it left depending on the rounding mode:
28710 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28711 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28712 // ...
28713 // (0xc9 << (2 * NewRM + 4)) & 0xc00
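    // For example (illustrative only), NewRM == 2 (round to +inf) gives a shift
    // of 2 * 2 + 4 = 8, and (0xc9 << 8) & 0xc00 == 0x800, i.e. RM bits 11:10 = 10.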
28714 SDValue ShiftValue =
28715 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28716 DAG.getNode(ISD::ADD, DL, MVT::i32,
28717 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28718 DAG.getConstant(1, DL, MVT::i8)),
28719 DAG.getConstant(4, DL, MVT::i32)));
28720 SDValue Shifted =
28721 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28722 ShiftValue);
28723 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28724 DAG.getConstant(0xc00, DL, MVT::i16));
28725 }
28726
28727 // Update rounding mode bits and store the new FP Control Word into stack.
28728 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28729 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28730
28731 // Load FP control word from the slot.
28732 SDValue OpsLD[] = {Chain, StackSlot};
28733   MachineMemOperand *MMOL =
28734       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28735 Chain = DAG.getMemIntrinsicNode(
28736 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28737
28738 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28739 // same way but in bits 14:13.
28740 if (Subtarget.hasSSE1()) {
28741 // Store MXCSR into memory.
28742 Chain = DAG.getNode(
28743 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28744 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28745 StackSlot);
28746
28747 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28748 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28749 Chain = CWD.getValue(1);
28750 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28751 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28752
28753 // Shift X87 RM bits from 11:10 to 14:13.
28754 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28755 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28756 DAG.getConstant(3, DL, MVT::i8));
28757
28758 // Update rounding mode bits and store the new FP Control Word into stack.
28759 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28760 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28761
28762 // Load MXCSR from the slot.
28763 Chain = DAG.getNode(
28764 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28765 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28766 StackSlot);
28767 }
28768
28769 return Chain;
28770}
28771
28772const unsigned X87StateSize = 28;
28773const unsigned FPStateSize = 32;
28774[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28775
28776SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28777 SelectionDAG &DAG) const {
28778   MachineFunction &MF = DAG.getMachineFunction();
28779   SDLoc DL(Op);
28780 SDValue Chain = Op->getOperand(0);
28781 SDValue Ptr = Op->getOperand(1);
28782   auto *Node = cast<FPStateAccessSDNode>(Op);
28783   EVT MemVT = Node->getMemoryVT();
28785 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28786
28787   // Get the x87 state, if present.
28788 if (Subtarget.hasX87()) {
28789 Chain =
28790 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28791 {Chain, Ptr}, MemVT, MMO);
28792
28793 // FNSTENV changes the exception mask, so load back the stored environment.
28794 MachineMemOperand::Flags NewFlags =
28797 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28798 Chain =
28799 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28800 {Chain, Ptr}, MemVT, MMO);
28801 }
28802
28803 // If target supports SSE, get MXCSR as well.
28804 if (Subtarget.hasSSE1()) {
28805 // Get pointer to the MXCSR location in memory.
28807 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28808 DAG.getConstant(X87StateSize, DL, PtrVT));
28809 // Store MXCSR into memory.
28810 Chain = DAG.getNode(
28811 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28812 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28813 MXCSRAddr);
28814 }
28815
28816 return Chain;
28817}
28818
28819 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28820                                    EVT MemVT, MachineMemOperand *MMO,
28821 SelectionDAG &DAG,
28822 const X86Subtarget &Subtarget) {
28823   // Set the x87 state, if present.
28824 if (Subtarget.hasX87())
28825 Chain =
28826 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28827 {Chain, Ptr}, MemVT, MMO);
28828 // If target supports SSE, set MXCSR as well.
28829 if (Subtarget.hasSSE1()) {
28830 // Get pointer to the MXCSR location in memory.
28832 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28833 DAG.getConstant(X87StateSize, DL, PtrVT));
28834 // Load MXCSR from memory.
28835 Chain = DAG.getNode(
28836 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28837 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28838 MXCSRAddr);
28839 }
28840 return Chain;
28841}
28842
28843SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28844 SelectionDAG &DAG) const {
28845 SDLoc DL(Op);
28846 SDValue Chain = Op->getOperand(0);
28847 SDValue Ptr = Op->getOperand(1);
28848   auto *Node = cast<FPStateAccessSDNode>(Op);
28849   EVT MemVT = Node->getMemoryVT();
28851 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28852 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28853}
28854
28855SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28856 SelectionDAG &DAG) const {
28857 MachineFunction &MF = DAG.getMachineFunction();
28858 SDLoc DL(Op);
28859 SDValue Chain = Op.getNode()->getOperand(0);
28860
28861 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28862 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28863   SmallVector<Constant *, 8> FPEnvVals;
28864 
28865   // x87 FPU Control Word: mask all floating-point exceptions and set rounding to
28866   // nearest. FPU precision is set to 53 bits on Windows and to 64 bits otherwise
28867   // for compatibility with glibc.
28868 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28869 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28870 Constant *Zero = ConstantInt::get(ItemTy, 0);
28871 for (unsigned I = 0; I < 6; ++I)
28872 FPEnvVals.push_back(Zero);
28873
28874   // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
28875   // all exception flags, and set DAZ and FTZ to 0.
28876 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28877 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28878 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28879 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28880 MachinePointerInfo MPI =
28882 MachineMemOperand *MMO = MF.getMachineMemOperand(
28884
28885 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28886}
28887
28888// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28889uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28890 assert((Amt < 8) && "Shift/Rotation amount out of range");
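  // For example (illustrative only), the SHL-by-1 control is
  //   (0x0102040810204080ULL >> 1) & (0x0101010101010101ULL * 0x7F)
  //   == 0x0081020408102040 & 0x7F7F7F7F7F7F7F7F == 0x0001020408102040.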
28891 switch (Opcode) {
28892 case ISD::BITREVERSE:
28893 return 0x8040201008040201ULL;
28894 case ISD::SHL:
28895 return ((0x0102040810204080ULL >> (Amt)) &
28896 (0x0101010101010101ULL * (0xFF >> (Amt))));
28897 case ISD::SRL:
28898 return ((0x0102040810204080ULL << (Amt)) &
28899 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28900 case ISD::SRA:
28901 return (getGFNICtrlImm(ISD::SRL, Amt) |
28902 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28903 case ISD::ROTL:
28904 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28905 case ISD::ROTR:
28906 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28907 }
28908 llvm_unreachable("Unsupported GFNI opcode");
28909}
28910
28911// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28912SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28913 MVT VT, unsigned Amt = 0) {
28914 assert(VT.getVectorElementType() == MVT::i8 &&
28915 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28916 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28917 SmallVector<SDValue> MaskBits;
28918 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28919 uint64_t Bits = (Imm >> (I % 64)) & 255;
28920 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28921 }
28922 return DAG.getBuildVector(VT, DL, MaskBits);
28923}
28924
28925 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
28926 //
28927 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
28928 // ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth) ). In case zext32(x) is illegal,
28929 // split the vector, perform the operation on its Lo and Hi parts and
28930 // concatenate the results.
28931 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28932                                          const X86Subtarget &Subtarget) {
28933 assert(Op.getOpcode() == ISD::CTLZ);
28934 SDLoc dl(Op);
28935 MVT VT = Op.getSimpleValueType();
28936 MVT EltVT = VT.getVectorElementType();
28937 unsigned NumElems = VT.getVectorNumElements();
28938
28939 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28940 "Unsupported element type");
28941
28942   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28943 if (NumElems > 16 ||
28944 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28945 return splitVectorIntUnary(Op, DAG, dl);
28946
28947 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28948 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28949 "Unsupported value type for operation");
28950
28951   // Use the natively supported vector instruction vplzcntd.
28952 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28953 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28954 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28955 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28956
28957 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28958}
28959
28960// Lower CTLZ using a PSHUFB lookup table implementation.
28961 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28962                                        const X86Subtarget &Subtarget,
28963 SelectionDAG &DAG) {
28964 MVT VT = Op.getSimpleValueType();
28965 int NumElts = VT.getVectorNumElements();
28966 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28967 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
28968
28969 // Per-nibble leading zero PSHUFB lookup table.
28970 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
28971 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
28972 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
28973 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
28974
28975   SmallVector<SDValue, 64> LUTVec;
28976   for (int i = 0; i < NumBytes; ++i)
28977 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28978 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
28979
28980 // Begin by bitcasting the input to byte vector, then split those bytes
28981 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
28982 // If the hi input nibble is zero then we add both results together, otherwise
28983 // we just take the hi result (by masking the lo result to zero before the
28984 // add).
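  // For example (illustrative only), for the byte 0x1A the hi nibble is 0x1, so
  // the LUT gives 3 and the lo result is masked to zero: ctlz8(0x1A) == 3. For
  // 0x0A the hi nibble is zero, so the hi result 4 and the lo result
  // LUT[0xA] == 0 are added: ctlz8(0x0A) == 4.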
28985 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
28986 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
28987
28988 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
28989 SDValue Lo = Op0;
28990 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
28991 SDValue HiZ;
28992 if (CurrVT.is512BitVector()) {
28993 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
28994 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
28995 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
28996 } else {
28997 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
28998 }
28999
29000 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29001 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29002 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29003 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29004
29005 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29006 // of the current vector width in the same way we did for the nibbles.
29007 // If the upper half of the input element is zero then add the halves'
29008 // leading zero counts together, otherwise just use the upper half's.
29009 // Double the width of the result until we are at target width.
29010 while (CurrVT != VT) {
29011 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29012 int CurrNumElts = CurrVT.getVectorNumElements();
29013 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29014 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29015 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29016
29017 // Check if the upper half of the input element is zero.
29018 if (CurrVT.is512BitVector()) {
29019 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29020 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29021 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29022 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29023 } else {
29024 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29025 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29026 }
29027 HiZ = DAG.getBitcast(NextVT, HiZ);
29028
29029 // Move the upper/lower halves to the lower bits as we'll be extending to
29030 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29031 // together.
29032 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29033 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29034 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29035 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29036 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29037 CurrVT = NextVT;
29038 }
29039
29040 return Res;
29041}
29042
29043 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29044                                const X86Subtarget &Subtarget,
29045 SelectionDAG &DAG) {
29046 MVT VT = Op.getSimpleValueType();
29047
29048 if (Subtarget.hasCDI() &&
29049 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29050 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29051 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29052
29053 // Decompose 256-bit ops into smaller 128-bit ops.
29054 if (VT.is256BitVector() && !Subtarget.hasInt256())
29055 return splitVectorIntUnary(Op, DAG, DL);
29056
29057 // Decompose 512-bit ops into smaller 256-bit ops.
29058 if (VT.is512BitVector() && !Subtarget.hasBWI())
29059 return splitVectorIntUnary(Op, DAG, DL);
29060
29061 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29062 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29063}
29064
29065 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29066                                     SelectionDAG &DAG,
29067 const X86Subtarget &Subtarget) {
29068 MVT VT = Op.getSimpleValueType();
29069 SDValue Input = Op.getOperand(0);
29070
29071 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29072 "Expected vXi8 input for GFNI-based CTLZ lowering");
29073
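  // This relies on ctlz(x) == cttz(bitreverse(x)): reverse the bits, isolate
  // the lowest set bit with (v & -v), and let GF2P8AFFINEQB with the CTTZ
  // matrix and immediate 8 map that one-hot byte to its bit index (an all-zero
  // byte yields 8).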
29074 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29075
29076 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29077 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29078
29079 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29080 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29081 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29082
29083 SDValue LZCNT =
29084 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29085 DAG.getTargetConstant(8, DL, MVT::i8));
29086 return LZCNT;
29087}
29088
29089static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29090 SelectionDAG &DAG) {
29091 MVT VT = Op.getSimpleValueType();
29092 MVT OpVT = VT;
29093 unsigned NumBits = VT.getSizeInBits();
29094 SDLoc dl(Op);
29095 unsigned Opc = Op.getOpcode();
29096
29097 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29098 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29099
29100 if (VT.isVector())
29101 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29102
29103 Op = Op.getOperand(0);
29104 if (VT == MVT::i8) {
29105 // Zero extend to i32 since there is not an i8 bsr.
29106 OpVT = MVT::i32;
29107 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29108 }
29109
29110   // Check if we can safely pass a result through BSR for zero sources.
29111 SDValue PassThru = DAG.getUNDEF(OpVT);
29112 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29113 !DAG.isKnownNeverZero(Op))
29114 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29115
29116 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29117 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29118 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29119
29120 // Skip CMOV if we're using a pass through value.
29121 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29122 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29123 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29124 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29125 Op.getValue(1)};
29126 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29127 }
29128
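  // Since 0 <= bsr(x) <= NumBits - 1 and NumBits is a power of two,
  // (NumBits - 1) - bsr(x) == bsr(x) ^ (NumBits - 1); e.g. for i32 with
  // x == 0x00010000, bsr == 16 and 16 ^ 31 == 15 == ctlz(x). The pass-through
  // value 2 * NumBits - 1 likewise becomes NumBits after the xor.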
29129 // Finally xor with NumBits-1.
29130 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29131 DAG.getConstant(NumBits - 1, dl, OpVT));
29132
29133 if (VT == MVT::i8)
29134 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29135 return Op;
29136}
29137
29138static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29139 SelectionDAG &DAG) {
29140 MVT VT = Op.getSimpleValueType();
29141 unsigned NumBits = VT.getScalarSizeInBits();
29142 SDValue N0 = Op.getOperand(0);
29143 SDLoc dl(Op);
29144 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29145
29146 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29147 "Only scalar CTTZ requires custom lowering");
29148
29149   // Check if we can safely pass a result through BSF for zero sources.
29150 SDValue PassThru = DAG.getUNDEF(VT);
29151 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29152 PassThru = DAG.getConstant(NumBits, dl, VT);
29153
29154 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29155 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29156 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29157
29158 // Skip CMOV if src is never zero or we're using a pass through value.
29159 if (NonZeroSrc || !PassThru.isUndef())
29160 return Op;
29161
29162 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29163 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29164 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29165 Op.getValue(1)};
29166 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29167}
29168
29169 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29170                            const X86Subtarget &Subtarget) {
29171 MVT VT = Op.getSimpleValueType();
29172 SDLoc DL(Op);
29173
29174 if (VT == MVT::i16 || VT == MVT::i32)
29175 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29176
29177 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29178 return splitVectorIntBinary(Op, DAG, DL);
29179
29180 assert(Op.getSimpleValueType().is256BitVector() &&
29181 Op.getSimpleValueType().isInteger() &&
29182 "Only handle AVX 256-bit vector integer operation");
29183 return splitVectorIntBinary(Op, DAG, DL);
29184}
29185
29186 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29187                                   const X86Subtarget &Subtarget) {
29188 MVT VT = Op.getSimpleValueType();
29189 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29190 unsigned Opcode = Op.getOpcode();
29191 SDLoc DL(Op);
29192
29193 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29194 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29195 assert(Op.getSimpleValueType().isInteger() &&
29196 "Only handle AVX vector integer operation");
29197 return splitVectorIntBinary(Op, DAG, DL);
29198 }
29199
29200 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29202 EVT SetCCResultType =
29203 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29204
29205 unsigned BitWidth = VT.getScalarSizeInBits();
29206 if (Opcode == ISD::USUBSAT) {
29207 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29208 // Handle a special-case with a bit-hack instead of cmp+select:
29209 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29210 // If the target can use VPTERNLOG, DAGToDAG will match this as
29211 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29212 // "broadcast" constant load.
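      // Why this works: with SMIN == signbit, X u>= SMIN iff the sign bit of X
      // is set; then X - SMIN == X ^ SMIN and X s>> (BW-1) is all ones, so the
      // AND keeps X ^ SMIN. Otherwise the usubsat saturates to 0 and
      // X s>> (BW-1) is 0, so the AND yields 0.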
29213       ConstantSDNode *C = isConstOrConstSplat(Y, true);
29214       if (C && C->getAPIntValue().isSignMask()) {
29215 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29216 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29217 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29218 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29219 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29220 }
29221 }
29222 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29223 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29224 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29225 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29226 // TODO: Move this to DAGCombiner?
29227 if (SetCCResultType == VT &&
29228 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29229 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29230 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29231 }
29232 }
29233
29234 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29235 (!VT.isVector() || VT == MVT::v2i64)) {
29236     APInt MinVal = APInt::getSignedMinValue(BitWidth);
29237     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29238     SDValue Zero = DAG.getConstant(0, DL, VT);
29239 SDValue Result =
29240 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29241 DAG.getVTList(VT, SetCCResultType), X, Y);
29242 SDValue SumDiff = Result.getValue(0);
29243 SDValue Overflow = Result.getValue(1);
29244 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29245 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29246 SDValue SumNeg =
29247 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29248 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29249 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29250 }
29251
29252 // Use default expansion.
29253 return SDValue();
29254}
29255
29256static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29257 SelectionDAG &DAG) {
29258 MVT VT = Op.getSimpleValueType();
29259 SDLoc DL(Op);
29260
29261 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29262 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29263 // 8-bit integer abs to NEG and CMOV.
29264 SDValue N0 = Op.getOperand(0);
29265 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29266 DAG.getConstant(0, DL, VT), N0);
29267 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29268 SDValue(Neg.getNode(), 1)};
29269 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29270 }
29271
29272 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
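  // X86ISD::BLENDV picks each lane based on the sign bit of its first operand
  // (X here): negative lanes take 0-X and non-negative lanes take X, i.e. |X|.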
29273 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29274 SDValue Src = Op.getOperand(0);
29275 SDValue Neg = DAG.getNegative(Src, DL, VT);
29276 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29277 }
29278
29279 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29280 assert(VT.isInteger() &&
29281 "Only handle AVX 256-bit vector integer operation");
29282 return splitVectorIntUnary(Op, DAG, DL);
29283 }
29284
29285 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29286 return splitVectorIntUnary(Op, DAG, DL);
29287
29288 // Default to expand.
29289 return SDValue();
29290}
29291
29292static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29293 SelectionDAG &DAG) {
29294 MVT VT = Op.getSimpleValueType();
29295 SDLoc DL(Op);
29296
29297 // For AVX1 cases, split to use legal ops.
29298 if (VT.is256BitVector() && !Subtarget.hasInt256())
29299 return splitVectorIntBinary(Op, DAG, DL);
29300
29301 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29302 return splitVectorIntBinary(Op, DAG, DL);
29303
29304 // Default to expand.
29305 return SDValue();
29306}
29307
29308static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29309 SelectionDAG &DAG) {
29310 MVT VT = Op.getSimpleValueType();
29311 SDLoc DL(Op);
29312
29313 // For AVX1 cases, split to use legal ops.
29314 if (VT.is256BitVector() && !Subtarget.hasInt256())
29315 return splitVectorIntBinary(Op, DAG, DL);
29316
29317 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29318 return splitVectorIntBinary(Op, DAG, DL);
29319
29320 // Default to expand.
29321 return SDValue();
29322}
29323
29324 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29325                                       SelectionDAG &DAG) {
29326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29327 EVT VT = Op.getValueType();
29328 SDValue X = Op.getOperand(0);
29329 SDValue Y = Op.getOperand(1);
29330 SDLoc DL(Op);
29331 bool IsMaxOp =
29332 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29333 bool IsNum =
29334 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29335 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29336 unsigned Opc = 0;
29337 if (VT.isVector())
29339 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29341
29342 if (Opc) {
29343 SDValue Imm =
29344 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29345 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29346 }
29347 }
29348
29349 uint64_t SizeInBits = VT.getScalarSizeInBits();
29350 APInt PreferredZero = APInt::getZero(SizeInBits);
29351 APInt OppositeZero = PreferredZero;
29352 EVT IVT = VT.changeTypeToInteger();
29353 X86ISD::NodeType MinMaxOp;
29354 if (IsMaxOp) {
29355 MinMaxOp = X86ISD::FMAX;
29356 OppositeZero.setSignBit();
29357 } else {
29358 PreferredZero.setSignBit();
29359 MinMaxOp = X86ISD::FMIN;
29360 }
29361 EVT SetCCType =
29362 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29363
29364 // The tables below show the expected result of Max in cases of NaN and
29365 // signed zeros.
29366 //
29367 // Y Y
29368 // Num xNaN +0 -0
29369 // --------------- ---------------
29370 // Num | Max | Y | +0 | +0 | +0 |
29371 // X --------------- X ---------------
29372 // xNaN | X | X/Y | -0 | +0 | -0 |
29373 // --------------- ---------------
29374 //
29375 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29376 // reordering.
29377 //
29378 // We check if any of operands is NaN and return NaN. Then we check if any of
29379 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29380 // to ensure the correct zero is returned.
29381 auto MatchesZero = [](SDValue Op, APInt Zero) {
29382     Op = peekThroughBitcasts(Op);
29383     if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29384 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29385 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29386 return CstOp->getAPIntValue() == Zero;
29387 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29388 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29389 for (const SDValue &OpVal : Op->op_values()) {
29390 if (OpVal.isUndef())
29391 continue;
29392 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29393 if (!CstOp)
29394 return false;
29395 if (!CstOp->getValueAPF().isZero())
29396 continue;
29397 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29398 return false;
29399 }
29400 return true;
29401 }
29402 return false;
29403 };
29404
29405 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29406 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29407 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29408 Op->getFlags().hasNoSignedZeros() ||
29409 DAG.isKnownNeverZeroFloat(X) ||
29410                           DAG.isKnownNeverZeroFloat(Y);
29411   SDValue NewX, NewY;
29412 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29413 MatchesZero(X, OppositeZero)) {
29414 // Operands are already in right order or order does not matter.
29415 NewX = X;
29416 NewY = Y;
29417 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29418 NewX = Y;
29419 NewY = X;
29420 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29421 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29422 if (IsXNeverNaN)
29423 std::swap(X, Y);
29424     // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
29425     // to an xmm register.
29426 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29427     SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29428     // Bits of classes:
29429 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29430 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
29431 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29432 DL, MVT::i32);
29433 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29434 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29435 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29436 DAG.getVectorIdxConstant(0, DL));
29437 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29438 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29439 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29440 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29441 } else {
29442 SDValue IsXSigned;
29443 if (Subtarget.is64Bit() || VT != MVT::f64) {
29444 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29445 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29446 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29447 } else {
29448 assert(VT == MVT::f64);
29449 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29450 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29451 DAG.getVectorIdxConstant(0, DL));
29452 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29453 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29454 DAG.getVectorIdxConstant(1, DL));
29455 Hi = DAG.getBitcast(MVT::i32, Hi);
29456 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29457 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29458 *DAG.getContext(), MVT::i32);
29459 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29460 }
29461 if (MinMaxOp == X86ISD::FMAX) {
29462 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29463 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29464 } else {
29465 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29466 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29467 }
29468 }
29469
29470 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29471 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29472
29473   // If we did not reorder the operands for signed-zero handling, NaN still
29474   // needs to be processed, and one of the operands is known not to be NaN, then:
29475   // - For minimum/maximum, put that operand in the first position,
29476   // - For minimumnum/maximumnum, put it in the second position,
29477   // and we will not need to post-process NaN after the max/min.
29478 if (IgnoreSignedZero && !IgnoreNaN &&
29479 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29480 std::swap(NewX, NewY);
29481
29482 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29483
29484 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29485 return MinMax;
29486
29487 if (DAG.isKnownNeverNaN(NewX))
29488 NewX = NewY;
29489
29490 SDValue IsNaN =
29491 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29492
29493 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29494}
29495
29496static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29497 SelectionDAG &DAG) {
29498 MVT VT = Op.getSimpleValueType();
29499 SDLoc dl(Op);
29500
29501 // For AVX1 cases, split to use legal ops.
29502 if (VT.is256BitVector() && !Subtarget.hasInt256())
29503 return splitVectorIntBinary(Op, DAG, dl);
29504
29505 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29506 return splitVectorIntBinary(Op, DAG, dl);
29507
29508 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29509 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29510
29511 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29512 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29513 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29514
29515 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29516 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29517 if (VT.bitsGE(MVT::i32)) {
29518 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29519 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29520 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29521 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29522 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29523 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29524 DAG.getTargetConstant(CC, dl, MVT::i8),
29525 Diff1.getValue(1));
29526 }
29527
29528 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29529 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29530 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29531 MVT WideVT = MVT::getIntegerVT(WideBits);
29532 if (TLI.isTypeLegal(WideVT)) {
29533 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29534 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29535 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29536 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29537 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29538 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29539 DAG.getTargetConstant(CC, dl, MVT::i8),
29540 Diff1.getValue(1));
29541 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29542 }
29543 }
29544
29545 // Default to expand.
29546 return SDValue();
29547}
29548
29549static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29550 SelectionDAG &DAG) {
29551 SDLoc dl(Op);
29552 MVT VT = Op.getSimpleValueType();
29553
29554 // Decompose 256-bit ops into 128-bit ops.
29555 if (VT.is256BitVector() && !Subtarget.hasInt256())
29556 return splitVectorIntBinary(Op, DAG, dl);
29557
29558 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29559 return splitVectorIntBinary(Op, DAG, dl);
29560
29561 SDValue A = Op.getOperand(0);
29562 SDValue B = Op.getOperand(1);
29563
29564 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29565 // vector pairs, multiply and truncate.
29566 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29567 unsigned NumElts = VT.getVectorNumElements();
29568 unsigned NumLanes = VT.getSizeInBits() / 128;
29569 unsigned NumEltsPerLane = NumElts / NumLanes;
29570
29571 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29572 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29573 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29574 return DAG.getNode(
29575 ISD::TRUNCATE, dl, VT,
29576 DAG.getNode(ISD::MUL, dl, ExVT,
29577 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29578 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29579 }
29580
29581 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29582
29583 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29584 // Don't do this if we only need to unpack one half.
29585 if (Subtarget.hasSSSE3()) {
29586 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29587 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29588 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29589 if (BIsBuildVector) {
29590 for (auto [Idx, Val] : enumerate(B->ops())) {
29591 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29592 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29593 else
29594 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29595 }
29596 }
29597 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29598 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29599 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29600 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29601 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29602 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29603 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29604 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29605 DAG.getTargetConstant(8, dl, MVT::i8));
29606 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29607 }
29608 }
29609
29610 // Extract the lo/hi parts to any extend to i16.
29611 // We're going to mask off the low byte of each result element of the
29612 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29613 // element.
29614 SDValue Undef = DAG.getUNDEF(VT);
29615 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29616 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29617
29618 SDValue BLo, BHi;
29619 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29620 // If the RHS is a constant, manually unpackl/unpackh.
29621 SmallVector<SDValue, 16> LoOps, HiOps;
29622 for (unsigned i = 0; i != NumElts; i += 16) {
29623 for (unsigned j = 0; j != 8; ++j) {
29624 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29625 MVT::i16));
29626 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29627 MVT::i16));
29628 }
29629 }
29630
29631 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29632 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29633 } else {
29634 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29635 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29636 }
29637
29638 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29639 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29640 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29641 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29642 }
29643
29644 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
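  // pmuludq multiplies the even (0 and 2) 32-bit lanes into 64-bit results, so
  // <a|b|c|d>*<e|f|g|h> gives <ae|cg>; shuffling the odd lanes down and
  // multiplying again gives <bf|dh>. Only the low 32 bits of each product are
  // needed, so the final {0,4,2,6} shuffle interleaves them back into
  // <ae|bf|cg|dh> (all modulo 2^32).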
29645 if (VT == MVT::v4i32) {
29646 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29647 "Should not custom lower when pmulld is available!");
29648
29649 // Extract the odd parts.
29650 static const int UnpackMask[] = {1, 1, 3, 3};
29651 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29652 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29653
29654 // Multiply the even parts.
29655 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29656 DAG.getBitcast(MVT::v2i64, A),
29657 DAG.getBitcast(MVT::v2i64, B));
29658 // Now multiply odd parts.
29659 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29660 DAG.getBitcast(MVT::v2i64, Aodds),
29661 DAG.getBitcast(MVT::v2i64, Bodds));
29662
29663 Evens = DAG.getBitcast(VT, Evens);
29664 Odds = DAG.getBitcast(VT, Odds);
29665
29666 // Merge the two vectors back together with a shuffle. This expands into 2
29667 // shuffles.
29668 static const int ShufMask[] = { 0, 4, 2, 6 };
29669 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29670 }
29671
29672 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29673 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29674 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29675
29676 // Ahi = psrlqi(a, 32);
29677 // Bhi = psrlqi(b, 32);
29678 //
29679 // AloBlo = pmuludq(a, b);
29680 // AloBhi = pmuludq(a, Bhi);
29681 // AhiBlo = pmuludq(Ahi, b);
29682 //
29683 // Hi = psllqi(AloBhi + AhiBlo, 32);
29684 // return AloBlo + Hi;
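  // This follows from writing A = Ahi*2^32 + Alo and B = Bhi*2^32 + Blo:
  //   A*B mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
  // since the Ahi*Bhi term is shifted out of the low 64 bits entirely.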
29685 KnownBits AKnown = DAG.computeKnownBits(A);
29686 KnownBits BKnown = DAG.computeKnownBits(B);
29687
29688 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29689 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29690 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29691
29692 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29693 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29694 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29695
29696 SDValue Zero = DAG.getConstant(0, dl, VT);
29697
29698 // Only multiply lo/hi halves that aren't known to be zero.
29699 SDValue AloBlo = Zero;
29700 if (!ALoIsZero && !BLoIsZero)
29701 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29702
29703 SDValue AloBhi = Zero;
29704 if (!ALoIsZero && !BHiIsZero) {
29705 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29706 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29707 }
29708
29709 SDValue AhiBlo = Zero;
29710 if (!AHiIsZero && !BLoIsZero) {
29711 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29712 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29713 }
29714
29715 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29716 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29717
29718 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29719}
29720
29721 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29722                                      MVT VT, bool IsSigned,
29723 const X86Subtarget &Subtarget,
29724 SelectionDAG &DAG,
29725 SDValue *Low = nullptr) {
29726 unsigned NumElts = VT.getVectorNumElements();
29727
29728 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29729 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29730 // lane results back together.
29731
29732 // We'll take different approaches for signed and unsigned.
29733   // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29734   // and use pmullw to calculate the full 16-bit product.
29735   // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29736 // shift them left into the upper byte of each word. This allows us to use
29737 // pmulhw to calculate the full 16-bit product. This trick means we don't
29738 // need to sign extend the bytes to use pmullw.
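  // With the bytes placed in the high half of each word, the signed product is
  // (a * 2^8) * (b * 2^8) == (a * b) * 2^16, so pmulhw (the high 16 bits of the
  // 32-bit signed product) returns the full 16-bit product a*b directly.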
29739
29740 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29741 SDValue Zero = DAG.getConstant(0, dl, VT);
29742
29743 SDValue ALo, AHi;
29744 if (IsSigned) {
29745 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29746 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29747 } else {
29748 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29749 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29750 }
29751
29752 SDValue BLo, BHi;
29753 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29754 // If the RHS is a constant, manually unpackl/unpackh and extend.
29755 SmallVector<SDValue, 16> LoOps, HiOps;
29756 for (unsigned i = 0; i != NumElts; i += 16) {
29757 for (unsigned j = 0; j != 8; ++j) {
29758 SDValue LoOp = B.getOperand(i + j);
29759 SDValue HiOp = B.getOperand(i + j + 8);
29760
29761 if (IsSigned) {
29762 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29763 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29764 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29765 DAG.getConstant(8, dl, MVT::i16));
29766 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29767 DAG.getConstant(8, dl, MVT::i16));
29768 } else {
29769 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29770 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29771 }
29772
29773 LoOps.push_back(LoOp);
29774 HiOps.push_back(HiOp);
29775 }
29776 }
29777
29778 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29779 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29780 } else if (IsSigned) {
29781 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29782 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29783 } else {
29784 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29785 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29786 }
29787
29788 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29789 // pack back to vXi8.
29790 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29791 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29792 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29793
29794 if (Low)
29795 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29796
29797 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29798}
29799
29800static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29801 SelectionDAG &DAG) {
29802 SDLoc dl(Op);
29803 MVT VT = Op.getSimpleValueType();
29804 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29805 unsigned NumElts = VT.getVectorNumElements();
29806 SDValue A = Op.getOperand(0);
29807 SDValue B = Op.getOperand(1);
29808
29809 // Decompose 256-bit ops into 128-bit ops.
29810 if (VT.is256BitVector() && !Subtarget.hasInt256())
29811 return splitVectorIntBinary(Op, DAG, dl);
29812
29813 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29814 return splitVectorIntBinary(Op, DAG, dl);
29815
29816 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29817 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29818 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29819 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29820
29821 // PMULxD operations multiply each even value (starting at 0) of LHS with
29822   // the related value of RHS and produce a widened result.
29823 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29824 // => <2 x i64> <ae|cg>
29825 //
29826   // In other words, to have all the results, we need to perform two PMULxD:
29827 // 1. one with the even values.
29828 // 2. one with the odd values.
29829   // To achieve #2, we need to place the odd values at an even position.
29830 //
29831 // Place the odd value at an even position (basically, shift all values 1
29832 // step to the left):
29833 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29834 9, -1, 11, -1, 13, -1, 15, -1};
29835 // <a|b|c|d> => <b|undef|d|undef>
29836 SDValue Odd0 =
29837 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29838 // <e|f|g|h> => <f|undef|h|undef>
29839 SDValue Odd1 =
29840 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29841
29842 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29843 // ints.
29844 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29845 unsigned Opcode =
29846 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29847 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29848 // => <2 x i64> <ae|cg>
29849 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29850 DAG.getBitcast(MulVT, A),
29851 DAG.getBitcast(MulVT, B)));
29852 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29853 // => <2 x i64> <bf|dh>
29854 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29855 DAG.getBitcast(MulVT, Odd0),
29856 DAG.getBitcast(MulVT, Odd1)));
29857
29858 // Shuffle it back into the right order.
29859 SmallVector<int, 16> ShufMask(NumElts);
29860 for (int i = 0; i != (int)NumElts; ++i)
29861 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29862
29863 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29864
29865     // If we have a signed multiply but no PMULDQ, fix up the result of an
29866 // unsigned multiply.
29867 if (IsSigned && !Subtarget.hasSSE41()) {
29868 SDValue Zero = DAG.getConstant(0, dl, VT);
29869 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29870 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29871 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29872 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29873
29874 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29875 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29876 }
29877
29878 return Res;
29879 }
29880
29881 // Only i8 vectors should need custom lowering after this.
29882 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29883 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29884 "Unsupported vector type");
29885
29886 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29887 // logical shift down the upper half and pack back to i8.
29888
29889 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29890 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29891
29892 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29893 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29894 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29895 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29896 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29897 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29898 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29899 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29900 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29901 }
29902
29903 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29904}
29905
29906// Custom lowering for SMULO/UMULO.
29907static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29908 SelectionDAG &DAG) {
29909 MVT VT = Op.getSimpleValueType();
29910
29911 // Scalars defer to LowerXALUO.
29912 if (!VT.isVector())
29913 return LowerXALUO(Op, DAG);
29914
29915 SDLoc dl(Op);
29916 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29917 SDValue A = Op.getOperand(0);
29918 SDValue B = Op.getOperand(1);
29919 EVT OvfVT = Op->getValueType(1);
29920
29921 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29922 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29923 // Extract the LHS Lo/Hi vectors
29924 SDValue LHSLo, LHSHi;
29925 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29926
29927 // Extract the RHS Lo/Hi vectors
29928 SDValue RHSLo, RHSHi;
29929 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29930
29931 EVT LoOvfVT, HiOvfVT;
29932 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29933 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29934 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29935
29936 // Issue the split operations.
29937 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29938 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29939
29940 // Join the separate data results and the overflow results.
29941 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29942 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29943 Hi.getValue(1));
29944
29945 return DAG.getMergeValues({Res, Ovf}, dl);
29946 }
29947
29948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29949 EVT SetccVT =
29950 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29951
29952 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29953 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29954 unsigned NumElts = VT.getVectorNumElements();
29955 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29956 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29957 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29958 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29959 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29960
29961 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29962
29963 SDValue Ovf;
29964 if (IsSigned) {
29965 SDValue High, LowSign;
29966 if (OvfVT.getVectorElementType() == MVT::i1 &&
29967 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29968         // Rather than truncating, try to do the compare on vXi16 or vXi32.
29969 // Shift the high down filling with sign bits.
29970 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29971 // Fill all 16 bits with the sign bit from the low.
29972 LowSign =
29973 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29974 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29975 15, DAG);
29976 SetccVT = OvfVT;
29977 if (!Subtarget.hasBWI()) {
29978 // We can't do a vXi16 compare so sign extend to v16i32.
29979 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29980 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29981 }
29982 } else {
29983 // Otherwise do the compare at vXi8.
29984 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29985 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29986 LowSign =
29987 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29988 }
29989
29990 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29991 } else {
29992 SDValue High =
29993 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29994 if (OvfVT.getVectorElementType() == MVT::i1 &&
29995 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29996         // Rather than truncating, try to do the compare on vXi16 or vXi32.
29997 SetccVT = OvfVT;
29998 if (!Subtarget.hasBWI()) {
29999 // We can't do a vXi16 compare so sign extend to v16i32.
30000 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30001 }
30002 } else {
30003 // Otherwise do the compare at vXi8.
30004 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30005 }
30006
30007 Ovf =
30008 DAG.getSetCC(dl, SetccVT, High,
30009 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30010 }
30011
30012 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30013
30014 return DAG.getMergeValues({Low, Ovf}, dl);
30015 }
30016
30017 SDValue Low;
30018 SDValue High =
30019 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30020
30021 SDValue Ovf;
30022 if (IsSigned) {
30023 // SMULO overflows if the high bits don't match the sign of the low.
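    // E.g. for i8, 100 * 2 == 200: the low byte is 0xC8 (-56), so LowSign is
    // all ones while the high byte is 0, and the mismatch flags the overflow.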
30024 SDValue LowSign =
30025 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30026 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30027 } else {
30028 // UMULO overflows if the high bits are non-zero.
30029 Ovf =
30030 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30031 }
30032
30033 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30034
30035 return DAG.getMergeValues({Low, Ovf}, dl);
30036}
30037
30038SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30039 assert(Subtarget.isTargetWin64() && "Unexpected target");
30040 EVT VT = Op.getValueType();
30041 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30042 "Unexpected return type for lowering");
30043
30044 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30045     SmallVector<SDValue> Result;
30046     if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30047 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30048 }
30049
30050 RTLIB::Libcall LC;
30051 bool isSigned;
30052 switch (Op->getOpcode()) {
30053 // clang-format off
30054 default: llvm_unreachable("Unexpected request for libcall!");
30055 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30056 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30057 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30058 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30059 // clang-format on
30060 }
30061
30062 SDLoc dl(Op);
30063 SDValue InChain = DAG.getEntryNode();
30064
30065 TargetLowering::ArgListTy Args;
30066 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30067 EVT ArgVT = Op->getOperand(i).getValueType();
30068 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30069 "Unexpected argument type for lowering");
30070 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30071 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30072 MachinePointerInfo MPI =
30073 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30074 InChain =
30075 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30076 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30077 }
30078
30079 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30080 getPointerTy(DAG.getDataLayout()));
30081
30082 TargetLowering::CallLoweringInfo CLI(DAG);
30083 CLI.setDebugLoc(dl)
30084 .setChain(InChain)
30085 .setLibCallee(
30086 getLibcallCallingConv(LC),
30087 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30088 std::move(Args))
30089 .setInRegister()
30090 .setSExtResult(isSigned)
30091 .setZExtResult(!isSigned);
30092
30093 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30094 return DAG.getBitcast(VT, CallInfo.first);
30095}
30096
30097SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30098 SelectionDAG &DAG,
30099 SDValue &Chain) const {
30100 assert(Subtarget.isTargetWin64() && "Unexpected target");
30101 EVT VT = Op.getValueType();
30102 bool IsStrict = Op->isStrictFPOpcode();
30103
30104 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30105 EVT ArgVT = Arg.getValueType();
30106
30107 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30108 "Unexpected return type for lowering");
30109
30110 RTLIB::Libcall LC;
30111 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30112 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30113 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30114 else
30115 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30116 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30117
30118 SDLoc dl(Op);
30119 MakeLibCallOptions CallOptions;
30120 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30121
30123 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30124 // expected VT (i128).
30125 std::tie(Result, Chain) =
30126 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30127 Result = DAG.getBitcast(VT, Result);
30128 return Result;
30129}
30130
30131SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30132 SelectionDAG &DAG) const {
30133 assert(Subtarget.isTargetWin64() && "Unexpected target");
30134 EVT VT = Op.getValueType();
30135 bool IsStrict = Op->isStrictFPOpcode();
30136
30137 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30138 EVT ArgVT = Arg.getValueType();
30139
30140 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30141 "Unexpected argument type for lowering");
30142
30143 RTLIB::Libcall LC;
30144 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30145 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30146 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30147 else
30148 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30149 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30150
30151 SDLoc dl(Op);
30152 MakeLibCallOptions CallOptions;
30153 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30154
30155 // Pass the i128 argument as an indirect argument on the stack.
30156 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30157 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30158 MachinePointerInfo MPI =
30159 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30160 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30161
30162 SDValue Result;
30163 std::tie(Result, Chain) =
30164 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30165 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30166}
30167
30168// Return true if the required (according to Opcode) shift-imm form is natively
30169// supported by the Subtarget
30170static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30171 unsigned Opcode) {
30172 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30173 "Unexpected shift opcode");
30174
30175 if (!VT.isSimple())
30176 return false;
30177
30178 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30179 return false;
30180
30181 if (VT.getScalarSizeInBits() < 16)
30182 return false;
30183
30184 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30185 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30186 return true;
30187
30188 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30189 (VT.is256BitVector() && Subtarget.hasInt256());
30190
30191 bool AShift = LShift && (Subtarget.hasAVX512() ||
30192 (VT != MVT::v2i64 && VT != MVT::v4i64));
30193 return (Opcode == ISD::SRA) ? AShift : LShift;
30194}
30195
30196// The shift amount is a variable, but it is the same for all vector lanes.
30197// These instructions are defined together with shift-immediate.
30198static
30199 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30200 unsigned Opcode) {
30201 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30202}
30203
30204// Return true if the required (according to Opcode) variable-shift form is
30205// natively supported by the Subtarget
30206static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30207 unsigned Opcode) {
30208 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30209 "Unexpected shift opcode");
30210
30211 if (!VT.isSimple())
30212 return false;
30213
30214 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30215 return false;
30216
30217 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30218 return false;
30219
30220 // vXi16 supported only on AVX-512, BWI
30221 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30222 return false;
30223
30224 if (Subtarget.hasAVX512() &&
30225 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30226 return true;
30227
30228 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30229 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30230 return (Opcode == ISD::SRA) ? AShift : LShift;
30231}
30232
30233 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30234 const X86Subtarget &Subtarget) {
30235 MVT VT = Op.getSimpleValueType();
30236 SDLoc dl(Op);
30237 SDValue R = Op.getOperand(0);
30238 SDValue Amt = Op.getOperand(1);
30239 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30240 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30241
30242 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30243 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30244 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30245 SDValue Ex = DAG.getBitcast(ExVT, R);
30246
30247 // ashr(R, 63) === cmp_slt(R, 0)
30248 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30249 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30250 "Unsupported PCMPGT op");
30251 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30252 }
30253
30254 if (ShiftAmt >= 32) {
30255 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30256 SDValue Upper =
30257 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30258 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30259 ShiftAmt - 32, DAG);
30260 if (VT == MVT::v2i64)
30261 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30262 if (VT == MVT::v4i64)
30263 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30264 {9, 1, 11, 3, 13, 5, 15, 7});
30265 } else {
30266 // SRA upper i32, SRL whole i64 and select lower i32.
30267 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30268 ShiftAmt, DAG);
30269 SDValue Lower =
30270 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30271 Lower = DAG.getBitcast(ExVT, Lower);
30272 if (VT == MVT::v2i64)
30273 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30274 if (VT == MVT::v4i64)
30275 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30276 {8, 1, 10, 3, 12, 5, 14, 7});
30277 }
30278 return DAG.getBitcast(VT, Ex);
30279 };
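// Sketch of ArithmeticShiftRight64 on one i64 lane {Lo, Hi} of i32 halves
// (little-endian; illustrative): for ShiftAmt >= 32 the result is
// { Hi >> (ShiftAmt - 32), Hi >> 31 }, i.e. the arithmetically shifted high
// half plus a sign splat, which the {5,1,7,3} / {9,1,...} shuffles select
// from Lower/Upper. For ShiftAmt < 32 the low half is taken from the whole
// i64 logical shift and only the high half needs the arithmetic i32 shift.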
30280
30281 // Optimize shl/srl/sra with constant shift amount.
30282 APInt APIntShiftAmt;
30283 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30284 return SDValue();
30285
30286 // If the shift amount is out of range, return undef.
30287 if (APIntShiftAmt.uge(EltSizeInBits))
30288 return DAG.getUNDEF(VT);
30289
30290 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30291
30292 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30293 // Hardware support for vector shifts is sparse which makes us scalarize the
30294 // vector operations in many cases. Also, on sandybridge ADD is faster than
30295 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30296 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30297 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30298 // must be 0). (add undef, undef) however can be any value. To make this
30299 // safe, we must freeze R to ensure that register allocation uses the same
30300 // register for an undefined value. This ensures that the result will
30301 // still be even and preserves the original semantics.
30302 R = DAG.getFreeze(R);
30303 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30304 }
30305
30306 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30307 }
30308
30309 // i64 SRA needs to be performed as partial shifts.
30310 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30311 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30312 Op.getOpcode() == ISD::SRA)
30313 return ArithmeticShiftRight64(ShiftAmt);
30314
30315 // If we're logical shifting an all-signbits value then we can just perform it
30316 // as a mask.
30317 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30318 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30319 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30320 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30321 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30322 }
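// Illustrative example of the mask transform above: if every i8 element is
// known to be 0x00 or 0xFF (all sign bits), then srl by 3 maps 0xFF to 0x1F
// and 0x00 to 0x00, which is exactly AND with (all-ones >> 3).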
30323
30324 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30325 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30326 unsigned NumElts = VT.getVectorNumElements();
30327 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30328
30329 // Simple i8 add case
30330 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30331 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30332 // must be 0). (add undef, undef) however can be any value. To make this
30333 // safe, we must freeze R to ensure that register allocation uses the same
30334 // register for an undefined value. This ensures that the result will
30335 // still be even and preserves the original semantics.
30336 R = DAG.getFreeze(R);
30337 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30338 }
30339
30340 // ashr(R, 7) === cmp_slt(R, 0)
30341 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30342 SDValue Zeros = DAG.getConstant(0, dl, VT);
30343 if (VT.is512BitVector()) {
30344 assert(VT == MVT::v64i8 && "Unexpected element type!");
30345 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30346 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30347 }
30348 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30349 }
30350
30351 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30352 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30353 return SDValue();
30354
30355 if (Subtarget.hasGFNI()) {
30356 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30357 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30358 DAG.getTargetConstant(0, dl, MVT::i8));
30359 }
30360
30361 if (Op.getOpcode() == ISD::SHL) {
30362 // Make a large shift.
30363 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30364 ShiftAmt, DAG);
30365 SHL = DAG.getBitcast(VT, SHL);
30366 // Zero out the rightmost bits.
30367 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30368 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30369 }
30370 if (Op.getOpcode() == ISD::SRL) {
30371 // Make a large shift.
30372 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30373 ShiftAmt, DAG);
30374 SRL = DAG.getBitcast(VT, SRL);
30375 // Zero out the leftmost bits.
30376 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30377 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30378 }
30379 if (Op.getOpcode() == ISD::SRA) {
30380 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
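// Worked example (illustrative): for an i8 lane 0x90 (-112) and Amt = 3,
// lshr gives 0x12, Mask = 128 >> 3 = 0x10, the xor gives 0x02 and the
// subtract gives 0xF2 (-14), matching the arithmetic shift -112 >> 3 = -14.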
30381 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30382
30383 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30384 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30385 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30386 return Res;
30387 }
30388 llvm_unreachable("Unknown shift opcode.");
30389 }
30390
30391 return SDValue();
30392}
30393
30394 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30395 const X86Subtarget &Subtarget) {
30396 MVT VT = Op.getSimpleValueType();
30397 SDLoc dl(Op);
30398 SDValue R = Op.getOperand(0);
30399 SDValue Amt = Op.getOperand(1);
30400 unsigned Opcode = Op.getOpcode();
30401 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30402
30403 int BaseShAmtIdx = -1;
30404 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30405 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30406 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30407 Subtarget, DAG);
30408
30409 // vXi8 shifts - shift as v8i16 + mask result.
30410 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30411 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30412 VT == MVT::v64i8) &&
30413 !Subtarget.hasXOP()) {
30414 unsigned NumElts = VT.getVectorNumElements();
30415 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30416 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30417 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30418 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30419
30420 // Create the mask using vXi16 shifts. For shift-rights we need to move
30421 // the upper byte down before splatting the vXi8 mask.
30422 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30423 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30424 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30425 if (Opcode != ISD::SHL)
30426 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30427 8, DAG);
30428 BitMask = DAG.getBitcast(VT, BitMask);
30429 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30430 SmallVector<int, 64>(NumElts, 0));
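// Example of the mask built above (illustrative, srl with BaseShAmt = 3):
// shifting the all-ones v8i16 gives 0x1FFF per word, the extra shift by 8
// moves the low byte's portion down to 0x001F, and the byte splat then
// yields 0x1F in every i8 lane - exactly the bits a per-byte lshr by 3 may
// keep.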
30431
30432 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30433 DAG.getBitcast(ExtVT, R), BaseShAmt,
30434 BaseShAmtIdx, Subtarget, DAG);
30435 Res = DAG.getBitcast(VT, Res);
30436 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30437
30438 if (Opcode == ISD::SRA) {
30439 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30440 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30441 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30442 SignMask =
30443 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30444 BaseShAmtIdx, Subtarget, DAG);
30445 SignMask = DAG.getBitcast(VT, SignMask);
30446 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30447 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30448 }
30449 return Res;
30450 }
30451 }
30452 }
30453
30454 return SDValue();
30455}
30456
30457// Convert a shift/rotate left amount to a multiplication scale factor.
30458 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30459 const X86Subtarget &Subtarget,
30460 SelectionDAG &DAG) {
30461 MVT VT = Amt.getSimpleValueType();
30462 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30463 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30464 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30465 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30466 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30467 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30468 return SDValue();
30469
30470 MVT SVT = VT.getVectorElementType();
30471 unsigned SVTBits = SVT.getSizeInBits();
30472 unsigned NumElems = VT.getVectorNumElements();
30473
30474 APInt UndefElts;
30475 SmallVector<APInt> EltBits;
30476 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30477 APInt One(SVTBits, 1);
30478 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30479 for (unsigned I = 0; I != NumElems; ++I) {
30480 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30481 continue;
30482 uint64_t ShAmt = EltBits[I].getZExtValue();
30483 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30484 }
30485 return DAG.getBuildVector(VT, dl, Elts);
30486 }
30487
30488 // If the target doesn't support variable shifts, use either FP conversion
30489 // or integer multiplication to avoid shifting each element individually.
30490 if (VT == MVT::v4i32) {
30491 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30492 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30493 DAG.getConstant(0x3f800000U, dl, VT));
30494 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30495 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30496 }
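// The block above materialises 2^Amt per lane by building an IEEE-754
// single with exponent 127 + Amt. Illustrative check for Amt = 5:
// (5 << 23) + 0x3f800000 = 0x42000000 = 32.0f, and FP_TO_SINT returns 32,
// the multiplication scale for a left shift by 5.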
30497
30498 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30499 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30500 SDValue Z = DAG.getConstant(0, dl, VT);
30501 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30502 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30503 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30504 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30505 if (Subtarget.hasSSE41())
30506 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30507 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30508 }
30509
30510 return SDValue();
30511}
30512
30513static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30514 SelectionDAG &DAG) {
30515 MVT VT = Op.getSimpleValueType();
30516 SDLoc dl(Op);
30517 SDValue R = Op.getOperand(0);
30518 SDValue Amt = Op.getOperand(1);
30519 unsigned NumElts = VT.getVectorNumElements();
30520 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30521 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30522
30523 unsigned Opc = Op.getOpcode();
30524 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30525 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30526
30527 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30528 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30529
30530 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30531 return V;
30532
30533 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30534 return V;
30535
30536 if (supportedVectorVarShift(VT, Subtarget, Opc))
30537 return Op;
30538
30539 // i64 vector arithmetic shift can be emulated with the transform:
30540 // M = lshr(SIGN_MASK, Amt)
30541 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30542 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30543 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30544 Opc == ISD::SRA) {
30545 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30546 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30547 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30548 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30549 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30550 return R;
30551 }
30552
30553 // XOP has 128-bit variable logical/arithmetic shifts.
30554 // +ve/-ve Amt = shift left/right.
30555 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30556 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30557 if (Opc == ISD::SRL || Opc == ISD::SRA)
30558 Amt = DAG.getNegative(Amt, dl, VT);
30559 if (Opc == ISD::SHL || Opc == ISD::SRL)
30560 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30561 if (Opc == ISD::SRA)
30562 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30563 }
30564
30565 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
30566 // shifts per-lane and then shuffle the partial results back together.
30567 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30568 // Splat the shift amounts so the scalar shifts above will catch it.
30569 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30570 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30571 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30572 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30573 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30574 }
30575
30576 // Build a map of inrange constant amounts with element mask where they occur.
30577 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30578 if (ConstantAmt) {
30579 for (unsigned I = 0; I != NumElts; ++I) {
30580 SDValue A = Amt.getOperand(I);
30581 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30582 continue;
30583 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30584 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30585 if (!Inserted) {
30586 It->second.setBit(I);
30587 continue;
30588 }
30589 It->second = APInt::getOneBitSet(NumElts, I);
30590 }
30591 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30592 }
30593
30594 // If possible, lower this shift as a sequence of two shifts by
30595 // constant plus a BLENDing shuffle instead of scalarizing it.
30596 // Example:
30597 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30598 //
30599 // Could be rewritten as:
30600 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30601 //
30602 // The advantage is that the two shifts from the example would be
30603 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30604 if (UniqueCstAmt.size() == 2 &&
30605 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30606 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30607 unsigned AmtA = UniqueCstAmt.begin()->first;
30608 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30609 const APInt &MaskA = UniqueCstAmt.begin()->second;
30610 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30611 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30612 for (unsigned I = 0; I != NumElts; ++I) {
30613 if (MaskA[I])
30614 ShuffleMask[I] = I;
30615 if (MaskB[I])
30616 ShuffleMask[I] = I + NumElts;
30617 }
30618
30619 // Only perform this blend if we can perform it without loading a mask.
30620 if ((VT != MVT::v16i16 ||
30621 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30622 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30623 canWidenShuffleElements(ShuffleMask))) {
30624 SDValue Shift1 =
30625 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30626 SDValue Shift2 =
30627 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30628 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30629 }
30630 }
30631
30632 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30633 // using vYiM vector operations where X*N == Y*M and M > N.
30634 if (ConstantAmt &&
30635 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30636 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30637 !Subtarget.hasXOP()) {
30638 MVT NarrowScalarVT = VT.getScalarType();
30639 // We can do this extra fast if each pair of narrow elements is shifted by
30640 // the same amount by doing this SWAR style: use a shift to move the valid
30641 // bits to the right position, mask out any bits which crossed from one
30642 // element to the other.
30643 // This optimized lowering is only valid if the elements in a pair can
30644 // be treated identically.
30645 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30646 SmallVector<SDValue, 32> TmpAmtWideElts;
30647 int WideEltSizeInBits = EltSizeInBits;
30648 while (WideEltSizeInBits < 32) {
30649 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30650 // unprofitable.
30651 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30652 break;
30653 }
30654 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30655 bool SameShifts = true;
30656 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30657 unsigned DstI = SrcI / 2;
30658 // Both elements are undef? Make a note and keep going.
30659 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30660 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30661 continue;
30662 }
30663 // Even element is undef? We will shift it by the same shift amount as
30664 // the odd element.
30665 if (AmtWideElts[SrcI].isUndef()) {
30666 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30667 continue;
30668 }
30669 // Odd element is undef? We will shift it by the same shift amount as
30670 // the even element.
30671 if (AmtWideElts[SrcI + 1].isUndef()) {
30672 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30673 continue;
30674 }
30675 // Both elements are equal.
30676 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30677 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30678 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30679 continue;
30680 }
30681 // One of the provisional wide elements will not have the same shift
30682 // amount. Let's bail.
30683 SameShifts = false;
30684 break;
30685 }
30686 if (!SameShifts) {
30687 break;
30688 }
30689 WideEltSizeInBits *= 2;
30690 std::swap(TmpAmtWideElts, AmtWideElts);
30691 }
30692 APInt APIntShiftAmt;
30693 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30694 bool Profitable = WidenShift;
30695 // AVX512BW brings support for vpsllvw.
30696 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30697 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30698 Profitable = false;
30699 }
30700 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30701 // fairly cheaply in other ways.
30702 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30703 Profitable = false;
30704 }
30705 // Leave it up to GFNI if we have it around.
30706 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30707 // is probably a win to use other strategies in some cases.
30708 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30709 Profitable = false;
30710 }
30711
30712 // AVX1 does not have vpand which makes our masking impractical. It does
30713 // have vandps but that is an FP instruction and crossing FP<->int typically
30714 // has some cost.
30715 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30716 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30717 Profitable = false;
30718 }
30719 unsigned WideNumElts = AmtWideElts.size();
30720 // We are only dealing with identical pairs.
30721 if (Profitable && WideNumElts != NumElts) {
30722 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30723 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30724 // Cast the operand to vXiM.
30725 SDValue RWide = DAG.getBitcast(WideVT, R);
30726 // Create our new vector of shift amounts.
30727 SDValue AmtWide = DAG.getBuildVector(
30728 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30729 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30730 // Perform the actual shift.
30731 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30732 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30733 // Now we need to construct a mask which will "drop" bits that get
30734 // shifted past the LSB/MSB. For a logical shift left, it will look
30735 // like:
30736 // FullMask = (1 << EltSizeInBits) - 1
30737 // Mask = FullMask << Amt
30738 //
30739 // This masking ensures that bits cannot migrate from one narrow lane to
30740 // another. The construction of this mask will be constant folded.
30741 // The mask for a logical right shift is nearly identical, the only
30742 // difference is that the all ones mask is shifted right instead of left.
30743 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30744 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30745 Mask = DAG.getBitcast(WideVT, Mask);
30746 // Finally, we mask the shifted vector with the SWAR mask.
30747 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30748 Masked = DAG.getBitcast(VT, Masked);
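// SWAR example (illustrative, i8 lanes widened to i16, logical srl by 3):
// adjacent bytes [0xAB, 0xCD] form the little-endian i16 0xCDAB; the wide
// shift gives 0x19B5, whose low byte 0xB5 has picked up bits from its
// neighbour, and ANDing with the per-byte mask 0x1F1F restores
// [0x15, 0x19], i.e. 0xAB >> 3 and 0xCD >> 3.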
30749 if (Opc != ISD::SRA) {
30750 // Logical shifts are complete at this point.
30751 return Masked;
30752 }
30753 // At this point, we have done a *logical* shift right. We now need to
30754 // sign extend the result so that we get behavior equivalent to an
30755 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30756 // are `EltSizeInBits-AmtWide` bits wide.
30757 //
30758 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30759 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30760 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30761 // can use the following trick to accomplish this:
30762 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30763 // (Masked ^ SignBitMask) - SignBitMask
30764 //
30765 // When the sign bit is already clear, this will compute:
30766 // Masked + SignBitMask - SignBitMask
30767 //
30768 // This is equal to Masked which is what we want: the sign bit was clear
30769 // so sign extending should be a no-op.
30770 //
30771 // When the sign bit is set, this will compute:
30772 // Masked - SignBitmask - SignBitMask
30773 //
30774 // This is equal to Masked - 2*SignBitMask which will correctly sign
30775 // extend our result.
30776 SDValue SplatHighBit =
30777 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30778 // This does not induce recursion, all operands are constants.
30779 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30780 SDValue FlippedSignBit =
30781 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30782 SDValue Subtraction =
30783 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30784 return Subtraction;
30785 }
30786 }
30787
30788 // If possible, lower this packed shift into a vector multiply instead of
30789 // expanding it into a sequence of scalar shifts.
30790 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30791 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30792 Subtarget.canExtendTo512BW())))
30793 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30794 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30795
30796 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30797 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
30798 if (Opc == ISD::SRL && ConstantAmt &&
30799 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30800 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30801 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30802 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30803 SDValue Zero = DAG.getConstant(0, dl, VT);
30804 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30805 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30806 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30807 }
30808 }
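// Illustrative check of the MULHU rewrite above for i16 lanes: a logical
// shift right by 3 uses the scale 1 << (16 - 3) = 0x2000, and
// mulhu(x, 0x2000) = (x * 0x2000) >> 16 = x >> 3. Zero-amount lanes are
// selected back to R because 1 << 16 is not representable as an i16 scale.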
30809
30810 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30811 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30812 // TODO: Special case handling for shift by 0/1, really we can afford either
30813 // of these cases in pre-SSE41/XOP/AVX512 but not both.
30814 if (Opc == ISD::SRA && ConstantAmt &&
30815 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30816 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30817 !Subtarget.hasAVX512()) ||
30818 DAG.isKnownNeverZero(Amt))) {
30819 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30820 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30821 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30822 SDValue Amt0 =
30823 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30824 SDValue Amt1 =
30825 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30826 SDValue Sra1 =
30827 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30828 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30829 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30830 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30831 }
30832 }
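// Similarly for the arithmetic case: mulhs(x, 1 << (16 - Amt)) == x >> Amt
// for 2 <= Amt <= 15, e.g. x = -5, Amt = 2: -5 * 0x4000 = 0xFFFEC000 and
// the high half 0xFFFE = -2 = -5 >> 2. Amt == 0 and Amt == 1 go through the
// selects above since 1 << 16 and 1 << 15 are not positive i16 scales.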
30833
30834 // v4i32 Non Uniform Shifts.
30835 // If the shift amount is constant we can shift each lane using the SSE2
30836 // immediate shifts, else we need to zero-extend each lane to the lower i64
30837 // and shift using the SSE2 variable shifts.
30838 // The separate results can then be blended together.
30839 if (VT == MVT::v4i32) {
30840 SDValue Amt0, Amt1, Amt2, Amt3;
30841 if (ConstantAmt) {
30842 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30843 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30844 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30845 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30846 } else {
30847 // The SSE2 shifts use the lower i64 as the same shift amount for
30848 // all lanes and the upper i64 is ignored. On AVX we're better off
30849 // just zero-extending, but for SSE just duplicating the top 16-bits is
30850 // cheaper and has the same effect for out of range values.
30851 if (Subtarget.hasAVX()) {
30852 SDValue Z = DAG.getConstant(0, dl, VT);
30853 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30854 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30855 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30856 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30857 } else {
30858 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30859 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30860 {4, 5, 6, 7, -1, -1, -1, -1});
30861 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30862 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30863 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30864 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30865 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30866 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30867 }
30868 }
30869
30870 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30871 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30872 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30873 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30874 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30875
30876 // Merge the shifted lane results optimally with/without PBLENDW.
30877 // TODO - ideally shuffle combining would handle this.
30878 if (Subtarget.hasSSE41()) {
30879 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30880 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30881 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30882 }
30883 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30884 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30885 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30886 }
30887
30888 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30889 // look up the pre-computed shift values.
30890 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30891 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30892 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30893 unsigned NumLanes = VT.getSizeInBits() / 128u;
30894 unsigned NumEltsPerLane = NumElts / NumLanes;
30895 SmallVector<APInt, 64> LUT;
30896 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30897 unsigned LoElt = Lane * NumEltsPerLane;
30898 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30899 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30900 if (!KnownLane.isConstant())
30901 break;
30902 const APInt &LaneSplat = KnownLane.getConstant();
30903 for (unsigned I = 0; I != 8; ++I) {
30904 if (Opc == ISD::SHL)
30905 LUT.push_back(LaneSplat.shl(I));
30906 else if (Opc == ISD::SRL)
30907 LUT.push_back(LaneSplat.lshr(I));
30908 else if (Opc == ISD::SRA)
30909 LUT.push_back(LaneSplat.ashr(I));
30910 }
30911 LUT.append(8, APInt::getZero(8));
30912 }
30913 if (LUT.size() == NumElts) {
30914 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30915 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30916 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30917 }
30918 }
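// Example of the PSHUFB lookup above (illustrative): if a 128-bit lane of R
// is the splat constant 0x12 and Opc is SHL, that lane's table becomes
// [0x12,0x24,0x48,0x90,0x20,0x40,0x80,0x00] followed by eight don't-care
// entries; a shift amount of 3 in Amt indexes entry 3 and returns
// 0x90 = 0x12 << 3, while amounts of 8 or more (undefined for i8 shifts)
// fall into the padding.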
30919
30920 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30921 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30922 // make the existing SSE solution better.
30923 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30924 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30925 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30926 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30927 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30928 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30929 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30930 "Unexpected vector type");
30931 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30932 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30933 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30934 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30935 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30936 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30937 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30938 }
30939
30940 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30941 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30942 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30943 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30944 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30945 !Subtarget.hasXOP()) {
30946 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30947 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30948
30949 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30950 // isn't legal).
30951 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30952 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30953 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30954 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30956 "Constant build vector expected");
30957
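// What the multiply below computes (illustrative): each byte is sign- or
// zero-extended to i16, multiplied by 1 << (8 - Amt), and the high byte of
// the 16-bit product is kept. E.g. srl by 3 of 0xB4: 0xB4 * 0x20 = 0x1680,
// high byte 0x16 = 0xB4 >> 3; for sra, sign-extended -76 * 0x20 = 0xF680,
// high byte 0xF6 = -10 = -76 >> 3.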
30958 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30959 bool IsSigned = Opc == ISD::SRA;
30960 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30961 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30962 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30963 return DAG.getZExtOrTrunc(R, dl, VT);
30964 }
30965
30966 SmallVector<SDValue, 16> LoAmt, HiAmt;
30967 for (unsigned i = 0; i != NumElts; i += 16) {
30968 for (int j = 0; j != 8; ++j) {
30969 LoAmt.push_back(Amt.getOperand(i + j));
30970 HiAmt.push_back(Amt.getOperand(i + j + 8));
30971 }
30972 }
30973
30974 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30975 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30976
30977 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30978 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30979 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30980 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30981 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30982 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30983 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30984 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30985 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30986 }
30987
30988 if (VT == MVT::v16i8 ||
30989 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30990 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30991 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30992
30993 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30994 if (VT.is512BitVector()) {
30995 // On AVX512BW targets we make use of the fact that VSELECT lowers
30996 // to a masked blend which selects bytes based just on the sign bit
30997 // extracted to a mask.
30998 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30999 V0 = DAG.getBitcast(VT, V0);
31000 V1 = DAG.getBitcast(VT, V1);
31001 Sel = DAG.getBitcast(VT, Sel);
31002 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31003 ISD::SETGT);
31004 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31005 } else if (Subtarget.hasSSE41()) {
31006 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31007 // on the sign bit.
31008 V0 = DAG.getBitcast(VT, V0);
31009 V1 = DAG.getBitcast(VT, V1);
31010 Sel = DAG.getBitcast(VT, Sel);
31011 return DAG.getBitcast(SelVT,
31012 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31013 }
31014 // On pre-SSE41 targets we test for the sign bit by comparing to
31015 // zero - a negative value will set all bits of the lanes to true
31016 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31017 SDValue Z = DAG.getConstant(0, dl, SelVT);
31018 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31019 return DAG.getSelect(dl, SelVT, C, V0, V1);
31020 };
31021
31022 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31023 // We can safely do this using i16 shifts as we're only interested in
31024 // the 3 lower bits of each byte.
31025 Amt = DAG.getBitcast(ExtVT, Amt);
31026 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31027 Amt = DAG.getBitcast(VT, Amt);
31028
31029 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31030 // r = VSELECT(r, shift(r, 4), a);
31031 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31032 R = SignBitSelect(VT, Amt, M, R);
31033
31034 // a += a
31035 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31036
31037 // r = VSELECT(r, shift(r, 2), a);
31038 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31039 R = SignBitSelect(VT, Amt, M, R);
31040
31041 // a += a
31042 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31043
31044 // return VSELECT(r, shift(r, 1), a);
31045 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31046 R = SignBitSelect(VT, Amt, M, R);
31047 return R;
31048 }
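// How the select ladder above applies a variable amount (illustrative,
// Amt = 5 = 0b101): after the initial << 5 the byte's sign bit holds amount
// bit 2, so the first blend applies the shift-by-4 step; doubling Amt
// exposes bit 1 (zero here, step skipped) and doubling again exposes bit 0,
// applying the final shift-by-1 for a total shift of 5.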
31049
31050 if (Opc == ISD::SRA) {
31051 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31052 // so we can correctly sign extend. We don't care what happens to the
31053 // lower byte.
31054 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31055 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31056 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31057 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31058 ALo = DAG.getBitcast(ExtVT, ALo);
31059 AHi = DAG.getBitcast(ExtVT, AHi);
31060 RLo = DAG.getBitcast(ExtVT, RLo);
31061 RHi = DAG.getBitcast(ExtVT, RHi);
31062
31063 // r = VSELECT(r, shift(r, 4), a);
31064 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31065 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31066 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31067 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31068
31069 // a += a
31070 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31071 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31072
31073 // r = VSELECT(r, shift(r, 2), a);
31074 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31075 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31076 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31077 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31078
31079 // a += a
31080 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31081 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31082
31083 // r = VSELECT(r, shift(r, 1), a);
31084 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31085 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31086 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31087 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31088
31089 // Logical shift the result back to the lower byte, leaving a zero upper
31090 // byte meaning that we can safely pack with PACKUSWB.
31091 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31092 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31093 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31094 }
31095 }
31096
31097 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31098 MVT ExtVT = MVT::v8i32;
31099 SDValue Z = DAG.getConstant(0, dl, VT);
31100 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31101 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31102 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31103 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31104 ALo = DAG.getBitcast(ExtVT, ALo);
31105 AHi = DAG.getBitcast(ExtVT, AHi);
31106 RLo = DAG.getBitcast(ExtVT, RLo);
31107 RHi = DAG.getBitcast(ExtVT, RHi);
31108 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31109 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31110 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31111 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31112 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31113 }
31114
31115 if (VT == MVT::v8i16) {
31116 // If we have a constant shift amount, the non-SSE41 path is best as
31117 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31118 bool UseSSE41 = Subtarget.hasSSE41() &&
31119 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31120
31121 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31122 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31123 // the sign bit.
31124 if (UseSSE41) {
31125 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31126 V0 = DAG.getBitcast(ExtVT, V0);
31127 V1 = DAG.getBitcast(ExtVT, V1);
31128 Sel = DAG.getBitcast(ExtVT, Sel);
31129 return DAG.getBitcast(
31130 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31131 }
31132 // On pre-SSE41 targets we splat the sign bit - a negative value will
31133 // set all bits of the lanes to true and VSELECT uses that in
31134 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31135 SDValue C =
31136 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31137 return DAG.getSelect(dl, VT, C, V0, V1);
31138 };
31139
31140 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31141 if (UseSSE41) {
31142 // On SSE41 targets we need to replicate the shift mask in both
31143 // bytes for PBLENDVB.
31144 Amt = DAG.getNode(
31145 ISD::OR, dl, VT,
31146 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31147 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31148 } else {
31149 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31150 }
31151
31152 // r = VSELECT(r, shift(r, 8), a);
31153 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31154 R = SignBitSelect(Amt, M, R);
31155
31156 // a += a
31157 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31158
31159 // r = VSELECT(r, shift(r, 4), a);
31160 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31161 R = SignBitSelect(Amt, M, R);
31162
31163 // a += a
31164 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31165
31166 // r = VSELECT(r, shift(r, 2), a);
31167 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31168 R = SignBitSelect(Amt, M, R);
31169
31170 // a += a
31171 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31172
31173 // return VSELECT(r, shift(r, 1), a);
31174 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31175 R = SignBitSelect(Amt, M, R);
31176 return R;
31177 }
31178
31179 // Decompose 256-bit shifts into 128-bit shifts.
31180 if (VT.is256BitVector())
31181 return splitVectorIntBinary(Op, DAG, dl);
31182
31183 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31184 return splitVectorIntBinary(Op, DAG, dl);
31185
31186 return SDValue();
31187}
31188
31189 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31190 SelectionDAG &DAG) {
31191 MVT VT = Op.getSimpleValueType();
31192 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31193 "Unexpected funnel shift opcode!");
31194
31195 SDLoc DL(Op);
31196 SDValue Op0 = Op.getOperand(0);
31197 SDValue Op1 = Op.getOperand(1);
31198 SDValue Amt = Op.getOperand(2);
31199 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31200 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31201
31202 if (VT.isVector()) {
31203 APInt APIntShiftAmt;
31204 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31205 unsigned NumElts = VT.getVectorNumElements();
31206
31207 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31208 if (IsFSHR)
31209 std::swap(Op0, Op1);
31210
31211 if (IsCstSplat) {
31212 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31213 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31214 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31215 {Op0, Op1, Imm}, DAG, Subtarget);
31216 }
31217 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31218 {Op0, Op1, Amt}, DAG, Subtarget);
31219 }
31220 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31221 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31222 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31223 "Unexpected funnel shift type!");
31224
31225 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31226 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
31227 if (IsCstSplat) {
31228 // TODO: Can't use generic expansion as UNDEF amt elements can be
31229 // converted to other values when folded to shift amounts, losing the
31230 // splat.
31231 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31232 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31233 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31234 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
31235 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31236
31237 if (EltSizeInBits == 8 &&
31238 (Subtarget.hasXOP() ||
31239 (useVPTERNLOG(Subtarget, VT) &&
31240 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31241 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31242 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31243 // the original vector width to handle cases where we split.
31244 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31245 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31246 SDValue ShX =
31247 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31248 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31249 SDValue ShY =
31250 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31251 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31252 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31253 DAG.getConstant(MaskX, DL, VT));
31254 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31255 DAG.getConstant(MaskY, DL, VT));
31256 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31257 }
31258
31259 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31260 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31261 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31262 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31263 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31264 }
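// Illustrative check of the splat-constant expansion above for i8 lanes:
// fshl(x, y, 3) = (x << 3) | (y >> 5). With x = 0xA5 and y = 0xC3 the
// concatenation x:y = 0xA5C3 shifted left by 3 is 0x2E18, whose high byte
// 0x2E equals (0xA5 << 3) | (0xC3 >> 5) = 0x28 | 0x06.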
31265
31266 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31267 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31268 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31269
31270 // Constant vXi16 funnel shifts can be efficiently handled by default.
31271 if (IsCst && EltSizeInBits == 16)
31272 return SDValue();
31273
31274 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31275 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31276 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31277
31278 // Split 256-bit integers on XOP/pre-AVX2 targets.
31279 // Split 512-bit integers on non 512-bit BWI targets.
31280 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31281 !Subtarget.hasAVX2())) ||
31282 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31283 EltSizeInBits < 32)) {
31284 // Pre-mask the amount modulo using the wider vector.
31285 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31286 return splitVectorOp(Op, DAG, DL);
31287 }
31288
31289 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31290 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31291 int ScalarAmtIdx = -1;
31292 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31293 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31294 if (EltSizeInBits == 16)
31295 return SDValue();
31296
31297 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31298 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31299 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31300 ScalarAmtIdx, Subtarget, DAG);
31301 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31302 ScalarAmtIdx, Subtarget, DAG);
31303 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31304 }
31305 }
31306
31307 MVT WideSVT = MVT::getIntegerVT(
31308 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31309 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31310
31311 // If per-element shifts are legal, fallback to generic expansion.
31312 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31313 return SDValue();
31314
31315 // Attempt to fold as:
31316 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31317 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31318 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31319 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31320 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31321 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31322 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31323 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31324 EltSizeInBits, DAG);
31325 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31326 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31327 if (!IsFSHR)
31328 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31329 EltSizeInBits, DAG);
31330 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31331 }
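// Illustrative check of the widened fold above for i8 lanes and fshr:
// with x = 0xA5, y = 0xC3 and z = 3, the i16 value (x << 8) | y = 0xA5C3
// shifted right by 3 is 0x14B8, and the truncation 0xB8 matches
// (0xA5 << 5) | (0xC3 >> 3) = 0xA0 | 0x18.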
31332
31333 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31334 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31335 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31336 SDValue Z = DAG.getConstant(0, DL, VT);
31337 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31338 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31339 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31340 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31341 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31342 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31343 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31344 }
31345
31346 // Fallback to generic expansion.
31347 return SDValue();
31348 }
31349 assert(
31350 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31351 "Unexpected funnel shift type!");
31352
31353 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31354 bool OptForSize = DAG.shouldOptForSize();
31355 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31356
31357 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31358 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31359 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31360 !isa<ConstantSDNode>(Amt)) {
31361 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31362 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31363 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31364 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31365 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31366 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31367 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31368 if (IsFSHR) {
31369 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31370 } else {
31371 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31372 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31373 }
31374 return DAG.getZExtOrTrunc(Res, DL, VT);
31375 }
31376
31377 if (VT == MVT::i8 || ExpandFunnel)
31378 return SDValue();
31379
31380 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31381 if (VT == MVT::i16) {
31382 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31383 DAG.getConstant(15, DL, Amt.getValueType()));
31384 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31385 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31386 }
31387
31388 return Op;
31389}
31390
31391static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31392 SelectionDAG &DAG) {
31393 MVT VT = Op.getSimpleValueType();
31394 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31395
31396 SDLoc DL(Op);
31397 SDValue R = Op.getOperand(0);
31398 SDValue Amt = Op.getOperand(1);
31399 unsigned Opcode = Op.getOpcode();
31400 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31401 int NumElts = VT.getVectorNumElements();
31402 bool IsROTL = Opcode == ISD::ROTL;
31403
31404 // Check for constant splat rotation amount.
31405 APInt CstSplatValue;
31406 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31407
31408 // Check for splat rotate by zero.
31409 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31410 return R;
31411
31412 // AVX512 implicitly uses modulo rotation amounts.
31413 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31414 // Attempt to rotate by immediate.
31415 if (IsCstSplat) {
31416 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31417 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31418 return DAG.getNode(RotOpc, DL, VT, R,
31419 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31420 }
31421
31422 // Else, fall-back on VPROLV/VPRORV.
31423 return Op;
31424 }
31425
31426 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31427 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31428 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31429 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31430 }
31431
31432 SDValue Z = DAG.getConstant(0, DL, VT);
31433
31434 if (!IsROTL) {
31435 // If the ISD::ROTR amount is constant, we're always better converting to
31436 // ISD::ROTL.
31437 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31438 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31439
31440    // XOP targets always prefer ISD::ROTL.
31441 if (Subtarget.hasXOP())
31442 return DAG.getNode(ISD::ROTL, DL, VT, R,
31443 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31444 }
31445
31446  // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31447 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31449 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31450 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31451 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31452 DAG.getTargetConstant(0, DL, MVT::i8));
31453 }
31454
31455 // Split 256-bit integers on XOP/pre-AVX2 targets.
31456 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31457 return splitVectorIntBinary(Op, DAG, DL);
31458
31459 // XOP has 128-bit vector variable + immediate rotates.
31460 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31461 // XOP implicitly uses modulo rotation amounts.
31462 if (Subtarget.hasXOP()) {
31463 assert(IsROTL && "Only ROTL expected");
31464 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31465
31466 // Attempt to rotate by immediate.
31467 if (IsCstSplat) {
31468 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31469 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31470 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31471 }
31472
31473 // Use general rotate by variable (per-element).
31474 return Op;
31475 }
31476
31477  // Rotate by a uniform constant - expand back to shifts.
31478 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31479 // to other values when folded to shift amounts, losing the splat.
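  // For example, with vXi8 elements and a splat amount of 3 this expands
  // ISD::ROTL to (R << 3) | (R >> 5) and ISD::ROTR to (R >> 3) | (R << 5).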
31480 if (IsCstSplat) {
31481 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31482 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31483 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31484 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31485 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31486 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31487 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31488 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31489 }
31490
31491 // Split 512-bit integers on non 512-bit BWI targets.
31492 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31493 return splitVectorIntBinary(Op, DAG, DL);
31494
31495 assert(
31496 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31497 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31498 Subtarget.hasAVX2()) ||
31499 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31500 "Only vXi32/vXi16/vXi8 vector rotates supported");
31501
31502 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31503 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31504
31505 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31506 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31507
31508 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31509 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31510 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31511 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31512 int BaseRotAmtIdx = -1;
31513 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31514 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31515 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31516 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31517 }
31518 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31519 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31520 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31521 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31522 BaseRotAmtIdx, Subtarget, DAG);
31523 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31524 BaseRotAmtIdx, Subtarget, DAG);
31525 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31526 }
31527 }
31528
31529 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31530 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31531
31532 // Attempt to fold as unpack(x,x) << zext(y):
31533 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31534 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31535 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31536 if (!(ConstantAmt && EltSizeInBits != 8) &&
31537 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31538 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31539 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31540 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31541 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31542 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31543 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31544 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31545 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31546 }
31547
31548 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31549 // the amount bit.
31550 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31551 if (EltSizeInBits == 8) {
31552 MVT WideVT =
31553 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31554
31555 // Attempt to fold as:
31556 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31557 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31558 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31559 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31560 // If we're rotating by constant, just use default promotion.
31561 if (ConstantAmt)
31562 return SDValue();
31563 // See if we can perform this by widening to vXi16 or vXi32.
31564 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31565 R = DAG.getNode(
31566 ISD::OR, DL, WideVT, R,
31567 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31568 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31569 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31570 if (IsROTL)
31571 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31572 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31573 }
31574
31575 // We don't need ModuloAmt here as we just peek at individual bits.
31576 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31577 if (Subtarget.hasSSE41()) {
31578 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31579 // on the sign bit.
31580 V0 = DAG.getBitcast(VT, V0);
31581 V1 = DAG.getBitcast(VT, V1);
31582 Sel = DAG.getBitcast(VT, Sel);
31583 return DAG.getBitcast(SelVT,
31584 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31585 }
31586 // On pre-SSE41 targets we test for the sign bit by comparing to
31587 // zero - a negative value will set all bits of the lanes to true
31588 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31589 SDValue Z = DAG.getConstant(0, DL, SelVT);
31590 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31591 return DAG.getSelect(DL, SelVT, C, V0, V1);
31592 };
31593
31594 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31595 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31596 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31597 IsROTL = true;
31598 }
31599
31600 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31601 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31602
31603 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31604 // We can safely do this using i16 shifts as we're only interested in
31605 // the 3 lower bits of each byte.
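    // For example, an amount of 0b101 (rotate by 5) becomes 0b10100000 after
    // the shift: bit 2 (the rot4 bit) now sits in the byte's sign bit, so the
    // first PBLENDVB selects the rot4 result; each subsequent 'a += a' moves
    // bit 1 (rot2) and then bit 0 (rot1) into the sign position.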
31606 Amt = DAG.getBitcast(ExtVT, Amt);
31607 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31608 Amt = DAG.getBitcast(VT, Amt);
31609
31610 // r = VSELECT(r, rot(r, 4), a);
31611 SDValue M;
31612 M = DAG.getNode(
31613 ISD::OR, DL, VT,
31614 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31615 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31616 R = SignBitSelect(VT, Amt, M, R);
31617
31618 // a += a
31619 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31620
31621 // r = VSELECT(r, rot(r, 2), a);
31622 M = DAG.getNode(
31623 ISD::OR, DL, VT,
31624 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31625 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31626 R = SignBitSelect(VT, Amt, M, R);
31627
31628 // a += a
31629 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31630
31631 // return VSELECT(r, rot(r, 1), a);
31632 M = DAG.getNode(
31633 ISD::OR, DL, VT,
31634 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31635 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31636 return SignBitSelect(VT, Amt, M, R);
31637 }
31638
31639 bool IsSplatAmt = DAG.isSplatValue(Amt);
31640 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31641 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31642
31643 // Fallback for splats + all supported variable shifts.
31644 // Fallback for non-constants AVX2 vXi16 as well.
31645 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31646 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31647 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31648 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31649 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31650 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31651 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31652 }
31653
31654 // Everything below assumes ISD::ROTL.
31655 if (!IsROTL) {
31656 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31657 IsROTL = true;
31658 }
31659
31660 // ISD::ROT* uses modulo rotate amounts.
31661 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31662
31663 assert(IsROTL && "Only ROTL supported");
31664
31665 // As with shifts, attempt to convert the rotation amount to a multiplication
31666 // factor, fallback to general expansion.
31667 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31668 if (!Scale)
31669 return SDValue();
31670
31671 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
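  // With Scale = 1 << Amt per lane, MUL produces the low 16 bits of R << Amt
  // and MULHU produces the bits shifted out of the top (R >> (16 - Amt)), so
  // OR-ing the two yields the 16-bit rotate.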
31672 if (EltSizeInBits == 16) {
31673 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31674 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31675 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31676 }
31677
31678 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31679 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31680 // that can then be OR'd with the lower 32-bits.
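  // Each 64-bit product holds (R << Amt) in its low 32 bits and
  // (R >> (32 - Amt)) in its high 32 bits; the shuffles below gather the low
  // and high halves of all four lanes so the final OR forms the rotate.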
31681 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31682 static const int OddMask[] = {1, 1, 3, 3};
31683 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31684 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31685
31686 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31687 DAG.getBitcast(MVT::v2i64, R),
31688 DAG.getBitcast(MVT::v2i64, Scale));
31689 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31690 DAG.getBitcast(MVT::v2i64, R13),
31691 DAG.getBitcast(MVT::v2i64, Scale13));
31692 Res02 = DAG.getBitcast(VT, Res02);
31693 Res13 = DAG.getBitcast(VT, Res13);
31694
31695 return DAG.getNode(ISD::OR, DL, VT,
31696 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31697 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31698}
31699
31700/// Returns true if the operand type is exactly twice the native width, and
31701/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31702/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31703/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31704bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31705 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31706
31707 if (OpWidth == 64)
31708 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31709 if (OpWidth == 128)
31710 return Subtarget.canUseCMPXCHG16B();
31711
31712 return false;
31713}
31714
31715TargetLoweringBase::AtomicExpansionKind
31716X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31717 Type *MemType = SI->getValueOperand()->getType();
31718
31719 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31720 !Subtarget.useSoftFloat()) {
31721 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31722 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31723      return AtomicExpansionKind::None;
31724
31725 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31726 Subtarget.hasAVX())
31727      return AtomicExpansionKind::None;
31728  }
31729
31730 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31731                                 : AtomicExpansionKind::None;
31732}
31733
31734// Note: this turns large loads into lock cmpxchg8b/16b.
31735TargetLoweringBase::AtomicExpansionKind
31736X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31737 Type *MemType = LI->getType();
31738
31739 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31740 !Subtarget.useSoftFloat()) {
31741    // If this is a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
31742 // can use movq to do the load. If we have X87 we can load into an 80-bit
31743 // X87 register and store it to a stack temporary.
31744 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31745 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31746      return AtomicExpansionKind::None;
31747
31748 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31749 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31750 Subtarget.hasAVX())
31751      return AtomicExpansionKind::None;
31752  }
31753
31754 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31755                                 : AtomicExpansionKind::None;
31756}
31757
31758enum BitTestKind : unsigned {
31759  UndefBit,
31760  ConstantBit,
31761  NotConstantBit,
31762  ShiftBit,
31763  NotShiftBit
31764};
31765
31766static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31767 using namespace llvm::PatternMatch;
31768 BitTestKind BTK = UndefBit;
31769 if (auto *C = dyn_cast<ConstantInt>(V)) {
31770 // Check if V is a power of 2 or NOT power of 2.
31771 if (isPowerOf2_64(C->getZExtValue()))
31772 BTK = ConstantBit;
31773 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31774 BTK = NotConstantBit;
31775 return {V, BTK};
31776 }
31777
31778 // Check if V is some power of 2 pattern known to be non-zero
31779 if (auto *I = dyn_cast<Instruction>(V)) {
31780 bool Not = false;
31781 // Check if we have a NOT
31782 Value *PeekI;
31783 if (match(I, m_Not(m_Value(PeekI))) ||
31784 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31785 Not = true;
31786 I = dyn_cast<Instruction>(PeekI);
31787
31788      // If I is constant, it will fold and we can evaluate later. If it's an
31789 // argument or something of that nature, we can't analyze.
31790 if (I == nullptr)
31791 return {nullptr, UndefBit};
31792 }
31793 // We can only use 1 << X without more sophisticated analysis. C << X where
31794 // C is a power of 2 but not 1 can result in zero which cannot be translated
31795 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
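    // For example, on i8 the value (2 << 7) wraps to zero, so no single bit
    // position can be recovered from it, whereas (1 << X) always has exactly
    // one bit set for an in-range X.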
31796 if (I->getOpcode() == Instruction::Shl) {
31797 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31798 // -X` and some other provable power of 2 patterns that we can use CTZ on
31799 // may be profitable.
31800 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31801 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31802 // be provably a non-zero power of 2.
31803 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31804 // transformable to bittest.
31805 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31806 if (!ShiftVal)
31807 return {nullptr, UndefBit};
31808 if (ShiftVal->equalsInt(1))
31809 BTK = Not ? NotShiftBit : ShiftBit;
31810
31811 if (BTK == UndefBit)
31812 return {nullptr, UndefBit};
31813
31814 Value *BitV = I->getOperand(1);
31815
31816 // Read past a shiftmask instruction to find count
31817 Value *AndOp;
31818 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31819 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31820 BitV = AndOp;
31821
31822 return {BitV, BTK};
31823 }
31824 }
31825 return {nullptr, UndefBit};
31826}
31827
31828TargetLoweringBase::AtomicExpansionKind
31829X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31830 using namespace llvm::PatternMatch;
31831 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31832 // prefix to a normal instruction for these operations.
31833 if (AI->use_empty())
31834    return AtomicExpansionKind::None;
31835
31836 if (AI->getOperation() == AtomicRMWInst::Xor) {
31837 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31838 // preferable to both `cmpxchg` and `btc`.
31839 if (match(AI->getOperand(1), m_SignMask()))
31840      return AtomicExpansionKind::None;
31841  }
31842
31843 // If the atomicrmw's result is used by a single bit AND, we may use
31844 // bts/btr/btc instruction for these operations.
31845 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31846 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31847 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31848 // detect it.
31849 Instruction *I = AI->user_back();
31850 auto BitChange = FindSingleBitChange(AI->getValOperand());
31851 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31852 I->getOpcode() != Instruction::And ||
31853 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31854 AI->getParent() != I->getParent())
31855    return AtomicExpansionKind::CmpXChg;
31856
31857 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31858
31859 // This is a redundant AND, it should get cleaned up elsewhere.
31860 if (AI == I->getOperand(OtherIdx))
31861    return AtomicExpansionKind::CmpXChg;
31862
31863  // The following instruction must be an AND with a single bit.
31864 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31865 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31866 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31867 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31868      return AtomicExpansionKind::CmpXChg;
31869    }
31870 if (AI->getOperation() == AtomicRMWInst::And) {
31871 return ~C1->getValue() == C2->getValue()
31872                 ? AtomicExpansionKind::BitTestIntrinsic
31873                 : AtomicExpansionKind::CmpXChg;
31874    }
31875    return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31876                    : AtomicExpansionKind::CmpXChg;
31877  }
31878
31879 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31880
31881 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31882 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31883    return AtomicExpansionKind::CmpXChg;
31884
31885 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31886
31887 // If shift amounts are not the same we can't use BitTestIntrinsic.
31888 if (BitChange.first != BitTested.first)
31889    return AtomicExpansionKind::CmpXChg;
31890
31891  // For atomic AND, the operand must clear exactly one bit (~(1 << x)) and
31892  // the AND must test the single bit that is unset in that mask.
31893 if (AI->getOperation() == AtomicRMWInst::And)
31894 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31895               ? AtomicExpansionKind::BitTestIntrinsic
31896               : AtomicExpansionKind::CmpXChg;
31897
31898  // For atomic XOR/OR, the operation must set and test the same bit.
31899 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31900             ? AtomicExpansionKind::BitTestIntrinsic
31901             : AtomicExpansionKind::CmpXChg;
31902}
31903
31904void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31905 IRBuilder<> Builder(AI);
31906 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31907  Intrinsic::ID IID_C;
31908  Intrinsic::ID IID_I;
31909  switch (AI->getOperation()) {
31910 default:
31911 llvm_unreachable("Unknown atomic operation");
31912 case AtomicRMWInst::Or:
31913 IID_C = Intrinsic::x86_atomic_bts;
31914 IID_I = Intrinsic::x86_atomic_bts_rm;
31915 break;
31916 case AtomicRMWInst::Xor:
31917 IID_C = Intrinsic::x86_atomic_btc;
31918 IID_I = Intrinsic::x86_atomic_btc_rm;
31919 break;
31920 case AtomicRMWInst::And:
31921 IID_C = Intrinsic::x86_atomic_btr;
31922 IID_I = Intrinsic::x86_atomic_btr_rm;
31923 break;
31924 }
31925 Instruction *I = AI->user_back();
31926 LLVMContext &Ctx = AI->getContext();
31927 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31928                                          PointerType::getUnqual(Ctx));
31929  Value *Result = nullptr;
31930 auto BitTested = FindSingleBitChange(AI->getValOperand());
31931 assert(BitTested.first != nullptr);
31932
31933 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31934 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31935
31936 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31937 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31938 {Addr, Builder.getInt8(Imm)});
31939 } else {
31940 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31941
31942 Value *SI = BitTested.first;
31943 assert(SI != nullptr);
31944
31945    // BT{S|R|C} on a memory operand does not take the bit position modulo the
31946    // operand width, so we need to mask it.
31947 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31948 Value *BitPos =
31949 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31950 // Todo(1): In many cases it may be provable that SI is less than
31951 // ShiftBits in which case this mask is unnecessary
31952 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31953 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31954 // favor of just a raw BT{S|R|C}.
31955
31956 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31957 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31958
31959 // If the result is only used for zero/non-zero status then we don't need to
31960    // shift the value back. Otherwise do so.
31961 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31962 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31963 if (ICmp->isEquality()) {
31964 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31965 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31966 if (C0 || C1) {
31967 assert(C0 == nullptr || C1 == nullptr);
31968 if ((C0 ? C0 : C1)->isZero())
31969 continue;
31970 }
31971 }
31972 }
31973 Result = Builder.CreateShl(Result, BitPos);
31974 break;
31975 }
31976 }
31977
31978 I->replaceAllUsesWith(Result);
31979 I->eraseFromParent();
31980 AI->eraseFromParent();
31981}
31982
31983static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31984  using namespace llvm::PatternMatch;
31985 if (!AI->hasOneUse())
31986 return false;
31987
31988 Value *Op = AI->getOperand(1);
31989 CmpPredicate Pred;
31990 Instruction *I = AI->user_back();
31991  AtomicRMWInst::BinOp Opc = AI->getOperation();
31992  if (Opc == AtomicRMWInst::Add) {
31993 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31994 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31995 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31996 if (match(I->user_back(),
31997                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31998        return true;
31999 if (match(I->user_back(),
32000                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32001        return true;
32002 }
32003 return false;
32004 }
32005 if (Opc == AtomicRMWInst::Sub) {
32006 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32007 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32008 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32009 if (match(I->user_back(),
32010                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32011        return true;
32012 if (match(I->user_back(),
32013                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32014        return true;
32015 }
32016 return false;
32017 }
32018 if ((Opc == AtomicRMWInst::Or &&
32019       match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32020      (Opc == AtomicRMWInst::And &&
32021       match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32022    if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32023 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32024 Pred == CmpInst::ICMP_SLT;
32025 if (match(I->user_back(),
32026              m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32027      return true;
32028 return false;
32029 }
32030 if (Opc == AtomicRMWInst::Xor) {
32031 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32032 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32033 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32034 if (match(I->user_back(),
32035                m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32036        return true;
32037 if (match(I->user_back(),
32038                m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32039        return true;
32040 }
32041 return false;
32042 }
32043
32044 return false;
32045}
32046
32047void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32048 AtomicRMWInst *AI) const {
32049 IRBuilder<> Builder(AI);
32050 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32051 Instruction *TempI = nullptr;
32052 LLVMContext &Ctx = AI->getContext();
32053 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32054 if (!ICI) {
32055 TempI = AI->user_back();
32056 assert(TempI->hasOneUse() && "Must have one use");
32057 ICI = cast<ICmpInst>(TempI->user_back());
32058 }
32059  X86::CondCode CC;
32060  ICmpInst::Predicate Pred = ICI->getPredicate();
32061 switch (Pred) {
32062 default:
32063 llvm_unreachable("Not supported Pred");
32064 case CmpInst::ICMP_EQ:
32065 CC = X86::COND_E;
32066 break;
32067 case CmpInst::ICMP_NE:
32068 CC = X86::COND_NE;
32069 break;
32070 case CmpInst::ICMP_SLT:
32071 CC = X86::COND_S;
32072 break;
32073 case CmpInst::ICMP_SGT:
32074 CC = X86::COND_NS;
32075 break;
32076 }
32077  Intrinsic::ID IID;
32078  switch (AI->getOperation()) {
32079 default:
32080 llvm_unreachable("Unknown atomic operation");
32081 case AtomicRMWInst::Add:
32082 IID = Intrinsic::x86_atomic_add_cc;
32083 break;
32084 case AtomicRMWInst::Sub:
32085 IID = Intrinsic::x86_atomic_sub_cc;
32086 break;
32087 case AtomicRMWInst::Or:
32088 IID = Intrinsic::x86_atomic_or_cc;
32089 break;
32090 case AtomicRMWInst::And:
32091 IID = Intrinsic::x86_atomic_and_cc;
32092 break;
32093 case AtomicRMWInst::Xor:
32094 IID = Intrinsic::x86_atomic_xor_cc;
32095 break;
32096 }
32097 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32098                                          PointerType::getUnqual(Ctx));
32099  Value *Call = Builder.CreateIntrinsic(
32100 IID, AI->getType(),
32101 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32102 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32103 ICI->replaceAllUsesWith(Result);
32104 ICI->eraseFromParent();
32105 if (TempI)
32106 TempI->eraseFromParent();
32107 AI->eraseFromParent();
32108}
32109
32110TargetLoweringBase::AtomicExpansionKind
32111X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32112 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32113 Type *MemType = AI->getType();
32114
32115 // If the operand is too big, we must see if cmpxchg8/16b is available
32116 // and default to library calls otherwise.
32117 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32118 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32119                                   : AtomicExpansionKind::None;
32120  }
32121
32122  AtomicRMWInst::BinOp Op = AI->getOperation();
32123  switch (Op) {
32124  case AtomicRMWInst::Xchg:
32125    return AtomicExpansionKind::None;
32126 case AtomicRMWInst::Add:
32127 case AtomicRMWInst::Sub:
32128    if (shouldExpandCmpArithRMWInIR(AI))
32129      return AtomicExpansionKind::CmpArithIntrinsic;
32130    // It's better to use xadd, xsub or xchg for these in other cases.
32131    return AtomicExpansionKind::None;
32132 case AtomicRMWInst::Or:
32133 case AtomicRMWInst::And:
32134 case AtomicRMWInst::Xor:
32135    if (shouldExpandCmpArithRMWInIR(AI))
32136      return AtomicExpansionKind::CmpArithIntrinsic;
32137    return shouldExpandLogicAtomicRMWInIR(AI);
32138  case AtomicRMWInst::Nand:
32139 case AtomicRMWInst::Max:
32140 case AtomicRMWInst::Min:
32151 default:
32152 // These always require a non-trivial set of data operations on x86. We must
32153 // use a cmpxchg loop.
32154    return AtomicExpansionKind::CmpXChg;
32155  }
32156}
32157
32158LoadInst *
32159X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32160 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32161 Type *MemType = AI->getType();
32162 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32163 // there is no benefit in turning such RMWs into loads, and it is actually
32164 // harmful as it introduces a mfence.
32165 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32166 return nullptr;
32167
32168 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32169 // lowering available in lowerAtomicArith.
32170 // TODO: push more cases through this path.
32171 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32172 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32173 AI->use_empty())
32174 return nullptr;
32175
32176 IRBuilder<> Builder(AI);
32177 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32178 auto SSID = AI->getSyncScopeID();
32179 // We must restrict the ordering to avoid generating loads with Release or
32180 // ReleaseAcquire orderings.
32181  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32182
32183 // Before the load we need a fence. Here is an example lifted from
32184 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32185 // is required:
32186 // Thread 0:
32187 // x.store(1, relaxed);
32188 // r1 = y.fetch_add(0, release);
32189 // Thread 1:
32190 // y.fetch_add(42, acquire);
32191 // r2 = x.load(relaxed);
32192 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32193 // lowered to just a load without a fence. A mfence flushes the store buffer,
32194 // making the optimization clearly correct.
32195 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32196 // otherwise, we might be able to be more aggressive on relaxed idempotent
32197 // rmw. In practice, they do not look useful, so we don't try to be
32198 // especially clever.
32199
32200  // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
32201 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32202 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32203
32204 // Finally we can emit the atomic load.
32205 LoadInst *Loaded = Builder.CreateAlignedLoad(
32206 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32207 Loaded->setAtomic(Order, SSID);
32208 AI->replaceAllUsesWith(Loaded);
32209 AI->eraseFromParent();
32210 return Loaded;
32211}
32212
32213/// Emit a locked operation on a stack location which does not change any
32214/// memory location, but does involve a lock prefix. Location is chosen to be
32215/// a) very likely accessed only by a single thread to minimize cache traffic,
32216/// and b) definitely dereferenceable. Returns the new Chain result.
32217static SDValue emitLockedStackOp(SelectionDAG &DAG,
32218                                 const X86Subtarget &Subtarget, SDValue Chain,
32219 const SDLoc &DL) {
32220 // Implementation notes:
32221 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32222 // operations issued by the current processor. As such, the location
32223 // referenced is not relevant for the ordering properties of the instruction.
32224  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32225 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32226 // 2) Using an immediate operand appears to be the best encoding choice
32227 // here since it doesn't require an extra register.
32228 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32229 // is small enough it might just be measurement noise.)
32230 // 4) When choosing offsets, there are several contributing factors:
32231 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32232 // line aligned stack object to improve this case.)
32233 // b) To minimize our chances of introducing a false dependence, we prefer
32234 // to offset the stack usage from TOS slightly.
32235 // c) To minimize concerns about cross thread stack usage - in particular,
32236 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32237 // captures state in the TOS frame and accesses it from many threads -
32238 // we want to use an offset such that the offset is in a distinct cache
32239 // line from the TOS frame.
32240 //
32241 // For a general discussion of the tradeoffs and benchmark results, see:
32242 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32243
32244 auto &MF = DAG.getMachineFunction();
32245 auto &TFL = *Subtarget.getFrameLowering();
32246 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32247
32248 if (Subtarget.is64Bit()) {
32249 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32250 SDValue Ops[] = {
32251 DAG.getRegister(X86::RSP, MVT::i64), // Base
32252 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32253 DAG.getRegister(0, MVT::i64), // Index
32254 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32255 DAG.getRegister(0, MVT::i16), // Segment.
32256 Zero,
32257 Chain};
32258 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32259 MVT::Other, Ops);
32260 return SDValue(Res, 1);
32261 }
32262
32263 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32264 SDValue Ops[] = {
32265 DAG.getRegister(X86::ESP, MVT::i32), // Base
32266 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32267 DAG.getRegister(0, MVT::i32), // Index
32268 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32269 DAG.getRegister(0, MVT::i16), // Segment.
32270 Zero,
32271 Chain
32272 };
32273 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32274 MVT::Other, Ops);
32275 return SDValue(Res, 1);
32276}
32277
32278static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32279                                 SelectionDAG &DAG) {
32280 SDLoc dl(Op);
32281 AtomicOrdering FenceOrdering =
32282 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32283 SyncScope::ID FenceSSID =
32284 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32285
32286 // The only fence that needs an instruction is a sequentially-consistent
32287 // cross-thread fence.
32288 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32289 FenceSSID == SyncScope::System) {
32290 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32291 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32292
32293 SDValue Chain = Op.getOperand(0);
32294 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32295 }
32296
32297 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32298 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32299}
32300
32301static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32302                             SelectionDAG &DAG) {
32303 MVT T = Op.getSimpleValueType();
32304 SDLoc DL(Op);
32305 unsigned Reg = 0;
32306 unsigned size = 0;
32307 switch(T.SimpleTy) {
32308 default: llvm_unreachable("Invalid value type!");
32309 case MVT::i8: Reg = X86::AL; size = 1; break;
32310 case MVT::i16: Reg = X86::AX; size = 2; break;
32311 case MVT::i32: Reg = X86::EAX; size = 4; break;
32312 case MVT::i64:
32313 assert(Subtarget.is64Bit() && "Node not type legal!");
32314 Reg = X86::RAX; size = 8;
32315 break;
32316 }
32317 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32318 Op.getOperand(2), SDValue());
32319 SDValue Ops[] = { cpIn.getValue(0),
32320 Op.getOperand(1),
32321 Op.getOperand(3),
32322 DAG.getTargetConstant(size, DL, MVT::i8),
32323 cpIn.getValue(1) };
32324 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32325 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32326  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32327                                           Ops, T, MMO);
32328
32329 SDValue cpOut =
32330 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32331 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32332 MVT::i32, cpOut.getValue(2));
32333 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32334
32335 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32336 cpOut, Success, EFLAGS.getValue(1));
32337}
32338
32339// Create MOVMSKB, taking into account whether we need to split for AVX1.
32340static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32341                           const X86Subtarget &Subtarget) {
32342 MVT InVT = V.getSimpleValueType();
32343
32344 if (InVT == MVT::v64i8) {
32345 SDValue Lo, Hi;
32346 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32347 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32348 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32349 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32350 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32351 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32352 DAG.getConstant(32, DL, MVT::i8));
32353 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32354 }
32355 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32356 SDValue Lo, Hi;
32357 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32358 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32359 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32360 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32361 DAG.getConstant(16, DL, MVT::i8));
32362 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32363 }
32364
32365 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32366}
32367
32368static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32369 SelectionDAG &DAG) {
32370 SDValue Src = Op.getOperand(0);
32371 MVT SrcVT = Src.getSimpleValueType();
32372 MVT DstVT = Op.getSimpleValueType();
32373
32374 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32375 // half to v32i1 and concatenating the result.
32376 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32377 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32378 assert(Subtarget.hasBWI() && "Expected BWI target");
32379 SDLoc dl(Op);
32380 SDValue Lo, Hi;
32381 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32382 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32383 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32384 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32385 }
32386
32387 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32388 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32389 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32390 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32391 SDLoc DL(Op);
32392 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32393 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32394 return DAG.getZExtOrTrunc(V, DL, DstVT);
32395 }
32396
32397 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32398 SrcVT == MVT::i64) && "Unexpected VT!");
32399
32400 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32401 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32402 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32403 // This conversion needs to be expanded.
32404 return SDValue();
32405
32406 SDLoc dl(Op);
32407 if (SrcVT.isVector()) {
32408    // Widen the input vector in the case of MVT::v2i32.
32409 // Example: from MVT::v2i32 to MVT::v4i32.
32410    MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32411                                 SrcVT.getVectorNumElements() * 2);
32412 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32413 DAG.getUNDEF(SrcVT));
32414 } else {
32415 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32416 "Unexpected source type in LowerBITCAST");
32417 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32418 }
32419
32420 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32421 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32422
32423 if (DstVT == MVT::x86mmx)
32424 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32425
32426 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32427 DAG.getVectorIdxConstant(0, dl));
32428}
32429
32430/// Compute the horizontal sum of bytes in V for the elements of VT.
32431///
32432/// Requires V to be a byte vector and VT to be an integer vector type with
32433/// wider elements than V's type. The width of the elements of VT determines
32434/// how many bytes of V are summed horizontally to produce each element of the
32435/// result.
32436static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32437                                      const X86Subtarget &Subtarget,
32438 SelectionDAG &DAG) {
32439 SDLoc DL(V);
32440 MVT ByteVecVT = V.getSimpleValueType();
32441 MVT EltVT = VT.getVectorElementType();
32442 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32443 "Expected value to have byte element type.");
32444 assert(EltVT != MVT::i8 &&
32445 "Horizontal byte sum only makes sense for wider elements!");
32446 unsigned VecSize = VT.getSizeInBits();
32447 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32448
32449  // The PSADBW instruction horizontally adds all bytes and leaves the result
32450  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
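  // With an all-zeros second operand, the sum of absolute differences computed
  // by PSADBW is just the sum of the eight byte values in each 64-bit lane.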
32451 if (EltVT == MVT::i64) {
32452 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32453 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32454 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32455 return DAG.getBitcast(VT, V);
32456 }
32457
32458 if (EltVT == MVT::i32) {
32459 // We unpack the low half and high half into i32s interleaved with zeros so
32460 // that we can use PSADBW to horizontally sum them. The most useful part of
32461 // this is that it lines up the results of two PSADBW instructions to be
32462 // two v2i64 vectors which concatenated are the 4 population counts. We can
32463 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32464 SDValue Zeros = DAG.getConstant(0, DL, VT);
32465 SDValue V32 = DAG.getBitcast(VT, V);
32466 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32467 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32468
32469 // Do the horizontal sums into two v2i64s.
32470 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32471 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32472 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32473 DAG.getBitcast(ByteVecVT, Low), Zeros);
32474 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32475 DAG.getBitcast(ByteVecVT, High), Zeros);
32476
32477 // Merge them together.
32478 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32479 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32480 DAG.getBitcast(ShortVecVT, Low),
32481 DAG.getBitcast(ShortVecVT, High));
32482
32483 return DAG.getBitcast(VT, V);
32484 }
32485
32486 // The only element type left is i16.
32487 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32488
32489 // To obtain pop count for each i16 element starting from the pop count for
32490 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32491 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32492 // directly supported.
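  // For an i16 lane holding the byte counts [Hi, Lo], the left shift gives
  // [Lo, 0], the byte-wise add gives [Hi + Lo, Lo], and the final right shift
  // leaves the combined count Hi + Lo in the low byte.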
32493 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32494 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32495 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32496 DAG.getBitcast(ByteVecVT, V));
32497 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32498}
32499
32500static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32501                                        const X86Subtarget &Subtarget,
32502 SelectionDAG &DAG) {
32503 MVT VT = Op.getSimpleValueType();
32504 MVT EltVT = VT.getVectorElementType();
32505 int NumElts = VT.getVectorNumElements();
32506 (void)EltVT;
32507 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32508
32509 // Implement a lookup table in register by using an algorithm based on:
32510 // http://wm.ite.pl/articles/sse-popcount.html
32511 //
32512 // The general idea is that every lower byte nibble in the input vector is an
32513    // index into an in-register pre-computed pop count table. We then split up the
32514 // input vector in two new ones: (1) a vector with only the shifted-right
32515 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32516 // masked out higher ones) for each byte. PSHUFB is used separately with both
32517 // to index the in-register table. Next, both are added and the result is a
32518 // i8 vector where each element contains the pop count for input byte.
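  // For example, the byte 0xA7 (0b10100111) splits into high nibble 0xA and
  // low nibble 0x7; the PSHUFB lookups return LUT[0xA] = 2 and LUT[0x7] = 3,
  // and their sum 5 is the byte's population count.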
32519 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32520 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32521 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32522 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32523
32524  SmallVector<SDValue, 64> LUTVec;
32525  for (int i = 0; i < NumElts; ++i)
32526 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32527 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32528 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32529
32530 // High nibbles
32531 SDValue FourV = DAG.getConstant(4, DL, VT);
32532 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32533
32534 // Low nibbles
32535 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32536
32537  // The input vector is used as the shuffle mask that indexes elements into the
32538 // LUT. After counting low and high nibbles, add the vector to obtain the
32539 // final pop count per i8 element.
32540 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32541 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32542 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32543}
32544
32545// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32546// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32547static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32548                                const X86Subtarget &Subtarget,
32549 SelectionDAG &DAG) {
32550 MVT VT = Op.getSimpleValueType();
32551 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32552 "Unknown CTPOP type to handle");
32553 SDValue Op0 = Op.getOperand(0);
32554
32555 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32556 if (Subtarget.hasVPOPCNTDQ()) {
32557 unsigned NumElems = VT.getVectorNumElements();
32558 assert((VT.getVectorElementType() == MVT::i8 ||
32559 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32560 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32561 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32562 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32563 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32564 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32565 }
32566 }
32567
32568 // Decompose 256-bit ops into smaller 128-bit ops.
32569 if (VT.is256BitVector() && !Subtarget.hasInt256())
32570 return splitVectorIntUnary(Op, DAG, DL);
32571
32572 // Decompose 512-bit ops into smaller 256-bit ops.
32573 if (VT.is512BitVector() && !Subtarget.hasBWI())
32574 return splitVectorIntUnary(Op, DAG, DL);
32575
32576 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32577 if (VT.getScalarType() != MVT::i8) {
32578 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32579 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32580 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32581 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32582 }
32583
32584 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32585 if (!Subtarget.hasSSSE3())
32586 return SDValue();
32587
32588 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32589}
32590
32591static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32592 SelectionDAG &DAG) {
32593 MVT VT = N.getSimpleValueType();
32594 SDValue Op = N.getOperand(0);
32595 SDLoc DL(N);
32596
32597 if (VT.isScalarInteger()) {
32598 // Compute the lower/upper bounds of the active bits of the value,
32599 // allowing us to shift the active bits down if necessary to fit into the
32600 // special cases below.
32601 KnownBits Known = DAG.computeKnownBits(Op);
32602 if (Known.isConstant())
32603 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32604 unsigned LZ = Known.countMinLeadingZeros();
32605 unsigned TZ = Known.countMinTrailingZeros();
32606 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32607 unsigned ActiveBits = Known.getBitWidth() - LZ;
32608 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32609
32610 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32611 if (ShiftedActiveBits <= 2) {
32612 if (ActiveBits > 2)
32613 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32614 DAG.getShiftAmountConstant(TZ, VT, DL));
32615 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32616 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32617 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32618 DAG.getShiftAmountConstant(1, VT, DL)));
32619 return DAG.getZExtOrTrunc(Op, DL, VT);
32620 }
32621
32622 // i3 CTPOP - perform LUT into i32 integer.
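    // The constant 0b1110100110010100 packs popcount(v) for v = 0..7 into
    // 2-bit fields (the field for v starts at bit 2*v), so shifting it right
    // by 2*v and masking with 0x3 yields the count directly.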
32623 if (ShiftedActiveBits <= 3) {
32624 if (ActiveBits > 3)
32625 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32626 DAG.getShiftAmountConstant(TZ, VT, DL));
32627 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32628 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32629 DAG.getShiftAmountConstant(1, VT, DL));
32630 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32631 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32632 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32633 DAG.getConstant(0x3, DL, MVT::i32));
32634 return DAG.getZExtOrTrunc(Op, DL, VT);
32635 }
32636
32637 // i4 CTPOP - perform LUT into i64 integer.
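    // The constant 0x4332322132212110 stores popcount(v) for v = 0..15 as
    // 4-bit nibbles (the nibble for v starts at bit 4*v), so shifting it right
    // by 4*v and masking with 0x7 extracts the count.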
32638 if (ShiftedActiveBits <= 4 &&
32639 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32640 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32641 if (ActiveBits > 4)
32642 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32643 DAG.getShiftAmountConstant(TZ, VT, DL));
32644 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32645 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32646 DAG.getConstant(4, DL, MVT::i32));
32647 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32648 DAG.getShiftAmountOperand(MVT::i64, Op));
32649 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32650 DAG.getConstant(0x7, DL, MVT::i64));
32651 return DAG.getZExtOrTrunc(Op, DL, VT);
32652 }
32653
32654 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
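    // The multiply by 0x08040201 places copies of the byte at bit offsets 0,
    // 9, 18 and 27 without overlap; after the shift by 3 and the AND with
    // 0x11111111, each 4-bit field holds one of the original bits. The second
    // multiply by 0x11111111 then sums those fields into the top nibble, which
    // the final shift by 28 extracts as the population count.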
32655 if (ShiftedActiveBits <= 8) {
32656 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32657 if (ActiveBits > 8)
32658 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32659 DAG.getShiftAmountConstant(TZ, VT, DL));
32660 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32661 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32662 DAG.getConstant(0x08040201U, DL, MVT::i32));
32663 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32664 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32665 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32666 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32667 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32668 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32669 return DAG.getZExtOrTrunc(Op, DL, VT);
32670 }
32671
32672 return SDValue(); // fallback to generic expansion.
32673 }
32674
32675 assert(VT.isVector() &&
32676 "We only do custom lowering for vector population count.");
32677 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32678}
32679
32680static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32681  MVT VT = Op.getSimpleValueType();
32682 SDValue In = Op.getOperand(0);
32683 SDLoc DL(Op);
32684
32685  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32686 // perform the BITREVERSE.
32687 if (!VT.isVector()) {
32688 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32689 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32690 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32691 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32692 DAG.getVectorIdxConstant(0, DL));
32693 }
32694
32695 int NumElts = VT.getVectorNumElements();
32696 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32697
32698 // Decompose 256-bit ops into smaller 128-bit ops.
32699 if (VT.is256BitVector())
32700 return splitVectorIntUnary(Op, DAG, DL);
32701
32702 assert(VT.is128BitVector() &&
32703 "Only 128-bit vector bitreverse lowering supported.");
32704
32705 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32706 // perform the BSWAP in the shuffle.
32707  // It's best to shuffle using the second operand as this will implicitly allow
32708 // memory folding for multiple vectors.
32709 SmallVector<SDValue, 16> MaskElts;
32710 for (int i = 0; i != NumElts; ++i) {
32711 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32712 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32713 int PermuteByte = SourceByte | (2 << 5);
32714 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32715 }
32716 }
32717
32718 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32719 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32720 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32721 Res, Mask);
32722 return DAG.getBitcast(VT, Res);
32723}
32724
32725static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32726                               SelectionDAG &DAG) {
32727 MVT VT = Op.getSimpleValueType();
32728
32729 if (Subtarget.hasXOP() && !VT.is512BitVector())
32730 return LowerBITREVERSE_XOP(Op, DAG);
32731
32732 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32733 "SSSE3 or GFNI required for BITREVERSE");
32734
32735 SDValue In = Op.getOperand(0);
32736 SDLoc DL(Op);
32737
32738 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32739 if (VT.is512BitVector() && !Subtarget.hasBWI())
32740 return splitVectorIntUnary(Op, DAG, DL);
32741
32742 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32743 if (VT.is256BitVector() && !Subtarget.hasInt256())
32744 return splitVectorIntUnary(Op, DAG, DL);
32745
32746 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32747 if (!VT.isVector()) {
32748 assert(
32749 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32750 "Only tested for i8/i16/i32/i64");
32751 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32752 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32753 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32754 DAG.getBitcast(MVT::v16i8, Res));
32755 Res =
32756 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32757 DAG.getVectorIdxConstant(0, DL));
32758 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32759 }
32760
32761 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32762
32763 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32764 if (VT.getScalarType() != MVT::i8) {
32765 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32766 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32767 Res = DAG.getBitcast(ByteVT, Res);
32768 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32769 return DAG.getBitcast(VT, Res);
32770 }
32771 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32772 "Only byte vector BITREVERSE supported");
32773
32774 unsigned NumElts = VT.getVectorNumElements();
32775
32776 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32777 if (Subtarget.hasGFNI()) {
32778    SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32779    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32780 DAG.getTargetConstant(0, DL, MVT::i8));
32781 }
32782
32783 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32784 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32785 // 0-15 value (moved to the other nibble).
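  // For example, the byte 0x1E bit-reverses to 0x78: the low nibble 0xE maps
  // to 0x70 in the low-nibble table and the high nibble 0x1 maps to 0x08 in
  // the high-nibble table, and OR-ing the two lookups gives 0x78.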
32786 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32787 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32788 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32789
32790 const int LoLUT[16] = {
32791 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32792 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32793 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32794 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32795 const int HiLUT[16] = {
32796 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32797 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32798 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32799 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32800
32801 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32802 for (unsigned i = 0; i < NumElts; ++i) {
32803 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32804 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32805 }
32806
32807 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32808 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32809 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32810 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32811 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32812}
32813
32814static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32815 SelectionDAG &DAG) {
32816 SDLoc DL(Op);
32817 SDValue X = Op.getOperand(0);
32818 MVT VT = Op.getSimpleValueType();
32819
32820 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32821 if (VT == MVT::i8 ||
32823 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32824 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32825 DAG.getConstant(0, DL, MVT::i8));
32826 // Copy the inverse of the parity flag into a register with setcc.
32827 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32828 // Extend to the original type.
32829 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32830 }
32831
32832 // If we have POPCNT, use the default expansion.
32833 if (Subtarget.hasPOPCNT())
32834 return SDValue();
32835
32836 if (VT == MVT::i64) {
32837 // Xor the high and low 32-bits together using a 32-bit operation.
32838 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32839 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32840 DAG.getConstant(32, DL, MVT::i8)));
32841 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32842 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32843 }
32844
32845 if (VT != MVT::i16) {
32846 // Xor the high and low 16-bits together using a 32-bit operation.
32847 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32848 DAG.getConstant(16, DL, MVT::i8));
32849 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32850 } else {
32851 // If the input is 16-bits, we need to extend to use an i32 shift below.
32852 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32853 }
32854
32855 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32856 // This should allow an h-reg to be used to save a shift.
32857 SDValue Hi = DAG.getNode(
32858 ISD::TRUNCATE, DL, MVT::i8,
32859 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32860 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32861 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32862 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32863
32864 // Copy the inverse of the parity flag into a register with setcc.
32865 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32866 // Extend to the original type.
32867 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32868}
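// Illustrative sketch (helper name is hypothetical): the same fold-then-test
// strategy as the lowering above, in plain C++. The value is XORed with its
// upper half until only the low byte matters; the parity of that byte equals
// the parity of the whole word. The lowering reads it straight from the PF
// flag of the final 8-bit XOR via SETNP; a compiler builtin stands in here.
static bool parity32ByFolding(unsigned X) {
  X ^= X >> 16; // fold the high 16 bits into the low 16 bits
  X ^= X >> 8;  // fold again; the overall parity now lives in the low byte
  return __builtin_parity(X & 0xFFu) != 0;
}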
32869
32871 const X86Subtarget &Subtarget) {
32872 unsigned NewOpc = 0;
32873 switch (N->getOpcode()) {
32874 case ISD::ATOMIC_LOAD_ADD:
32875 NewOpc = X86ISD::LADD;
32876 break;
32877 case ISD::ATOMIC_LOAD_SUB:
32878 NewOpc = X86ISD::LSUB;
32879 break;
32880 case ISD::ATOMIC_LOAD_OR:
32881 NewOpc = X86ISD::LOR;
32882 break;
32883 case ISD::ATOMIC_LOAD_XOR:
32884 NewOpc = X86ISD::LXOR;
32885 break;
32886 case ISD::ATOMIC_LOAD_AND:
32887 NewOpc = X86ISD::LAND;
32888 break;
32889 default:
32890 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32891 }
32892
32893 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32894
32895 return DAG.getMemIntrinsicNode(
32896 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32897 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32898 /*MemVT=*/N->getSimpleValueType(0), MMO);
32899}
32900
32901/// Lower atomic_load_ops into LOCK-prefixed operations.
32903 const X86Subtarget &Subtarget) {
32904 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32905 SDValue Chain = N->getOperand(0);
32906 SDValue LHS = N->getOperand(1);
32907 SDValue RHS = N->getOperand(2);
32908 unsigned Opc = N->getOpcode();
32909 MVT VT = N->getSimpleValueType(0);
32910 SDLoc DL(N);
32911
32912 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32913 // can only be lowered when the result is unused. They should have already
32914 // been transformed into a cmpxchg loop in AtomicExpand.
32915 if (N->hasAnyUseOfValue(0)) {
32916 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32917 // select LXADD if LOCK_SUB can't be selected.
32918 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32919 // can use LXADD as opposed to cmpxchg.
32920 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32921 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32922 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32923 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32924
32925 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32926 "Used AtomicRMW ops other than Add should have been expanded!");
32927 return N;
32928 }
32929
32930 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32931 // The core idea here is that since the memory location isn't actually
32932 // changing, all we need is a lowering for the *ordering* impacts of the
32933 // atomicrmw. As such, we can choose a different operation and memory
32934 // location to minimize impact on other code.
32935 // The above holds unless the node is marked volatile in which
32936 // case it needs to be preserved according to the langref.
32937 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32938 // On X86, the only ordering which actually requires an instruction is
32939 // seq_cst that isn't SingleThread; everything else just needs to be preserved
32940 // during codegen and then dropped. Note that we expect (but don't assume)
32941 // that orderings other than seq_cst and acq_rel have been canonicalized to
32942 // a store or load.
32945 // Prefer a locked operation against a stack location to minimize cache
32946 // traffic. This assumes that stack locations are very likely to be
32947 // accessed only by the owning thread.
32948 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32949 assert(!N->hasAnyUseOfValue(0));
32950 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32951 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32952 DAG.getUNDEF(VT), NewChain);
32953 }
32954 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32955 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32956 assert(!N->hasAnyUseOfValue(0));
32957 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32958 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32959 DAG.getUNDEF(VT), NewChain);
32960 }
32961
32962 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32963 // RAUW the chain, but don't worry about the result, as it's unused.
32964 assert(!N->hasAnyUseOfValue(0));
32965 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32966 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32967 DAG.getUNDEF(VT), LockOp.getValue(1));
32968}
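// Illustrative sketch of the sub->add rewrite above, using std::atomic
// (helper name is hypothetical). In two's complement, fetch_sub(V) and
// fetch_add(-V) return the same old value and leave the same new value in
// memory, so rewriting atomic_load_sub as atomic_load_add of the negated
// operand lets LXADD (lock xadd), which returns the old value, be selected.
#include <atomic>
static unsigned fetchSubViaXadd(std::atomic<unsigned> &A, unsigned V) {
  return A.fetch_add(0u - V); // same old value as A.fetch_sub(V)
}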
32969
32971 const X86Subtarget &Subtarget) {
32972 auto *Node = cast<AtomicSDNode>(Op.getNode());
32973 SDLoc dl(Node);
32974 EVT VT = Node->getMemoryVT();
32975
32976 bool IsSeqCst =
32977 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32978 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32979
32980 // If this store is not sequentially consistent and the type is legal
32981 // we can just keep it.
32982 if (!IsSeqCst && IsTypeLegal)
32983 return Op;
32984
32985 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32987 Attribute::NoImplicitFloat)) {
32988 SDValue Chain;
32989 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32990 // vector store.
32991 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32992 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32993 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32994 Node->getMemOperand());
32995 }
32996
32997 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32998 // is enabled.
32999 if (VT == MVT::i64) {
33000 if (Subtarget.hasSSE1()) {
33001 SDValue SclToVec =
33002 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
33003 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33004 SclToVec = DAG.getBitcast(StVT, SclToVec);
33005 SDVTList Tys = DAG.getVTList(MVT::Other);
33006 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33007 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33008 MVT::i64, Node->getMemOperand());
33009 } else if (Subtarget.hasX87()) {
33010 // First load this into an 80-bit X87 register using a stack temporary.
33011 // This will put the whole integer into the significand.
33012 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33013 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33014 MachinePointerInfo MPI =
33016 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33018 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33019 SDValue LdOps[] = {Chain, StackPtr};
33021 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33022 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33023 Chain = Value.getValue(1);
33024
33025 // Now use an FIST to do the atomic store.
33026 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33027 Chain =
33028 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33029 StoreOps, MVT::i64, Node->getMemOperand());
33030 }
33031 }
33032
33033 if (Chain) {
33034 // If this is a sequentially consistent store, also emit an appropriate
33035 // barrier.
33036 if (IsSeqCst)
33037 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33038
33039 return Chain;
33040 }
33041 }
33042
33043 // Convert seq_cst store -> xchg
33044 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33045 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33046 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33047 Node->getOperand(0), Node->getOperand(2),
33048 Node->getOperand(1), Node->getMemOperand());
33049 return Swap.getValue(1);
33050}
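// Illustrative sketch of the seq_cst-store-to-xchg fallback above (helper
// name is hypothetical): a plain MOV does not provide sequentially consistent
// ordering on its own, but an exchange whose result is discarded does, so the
// store is modelled as an atomic swap.
#include <atomic>
static void seqCstStoreViaXchg(std::atomic<long> &A, long V) {
  (void)A.exchange(V, std::memory_order_seq_cst); // old value intentionally unused
}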
33051
33053 SDNode *N = Op.getNode();
33054 MVT VT = N->getSimpleValueType(0);
33055 unsigned Opc = Op.getOpcode();
33056
33057 // Let legalize expand this if it isn't a legal type yet.
33058 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33059 return SDValue();
33060
33061 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33062 SDLoc DL(N);
33063
33064 // Set the carry flag.
33065 SDValue Carry = Op.getOperand(2);
33066 EVT CarryVT = Carry.getValueType();
33067 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33068 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33069
33070 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33071 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33072 Op.getOperand(0), Op.getOperand(1),
33073 Carry.getValue(1));
33074
33075 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33076 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33077 Sum.getValue(1), DL, DAG);
33078 if (N->getValueType(1) == MVT::i1)
33079 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33080
33081 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33082}
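// Illustrative sketch of what the ADC path above computes (names are
// hypothetical). The incoming carry is first materialized in EFLAGS by adding
// all-ones to it (1 + ~0 wraps and sets CF, 0 + ~0 does not), then ADC adds
// both operands plus that carry and reports the outgoing carry.
#include <cstdint>
static uint32_t addWithCarry(uint32_t A, uint32_t B, bool CarryIn,
                             bool &CarryOut) {
  uint64_t Wide = uint64_t(A) + B + (CarryIn ? 1 : 0);
  CarryOut = (Wide >> 32) != 0;
  return uint32_t(Wide);
}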
33083
33084static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33085 SelectionDAG &DAG) {
33086 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33087
33088 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33089 // which returns the values as { float, float } (in XMM0) or
33090 // { double, double } (which is returned in XMM0, XMM1).
33091 SDLoc dl(Op);
33092 SDValue Arg = Op.getOperand(0);
33093 EVT ArgVT = Arg.getValueType();
33094 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33095
33097 Args.emplace_back(Arg, ArgTy);
33098
33099 bool isF64 = ArgVT == MVT::f64;
33100 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33101 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33102 // the results are returned via SRet in memory.
33103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33104 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33105 const char *LibcallName = TLI.getLibcallName(LC);
33106 SDValue Callee =
33107 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33108
33109 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33110 : (Type *)FixedVectorType::get(ArgTy, 4);
33111
33113 CLI.setDebugLoc(dl)
33114 .setChain(DAG.getEntryNode())
33115 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33116
33117 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33118
33119 if (isF64)
33120 // Returned in xmm0 and xmm1.
33121 return CallResult.first;
33122
33123 // Returned in bits 0:31 and 32:63 of xmm0.
33124 SDValue SinVal =
33125 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33126 DAG.getVectorIdxConstant(0, dl));
33127 SDValue CosVal =
33128 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33129 DAG.getVectorIdxConstant(1, dl));
33130 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33131 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33132}
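// Illustrative sketch (hypothetical helper, not the Darwin declaration): the
// lowering above calls __sincos_stret, which returns both results at once;
// for f64 as { double, double } in XMM0/XMM1, for f32 packed into XMM0, hence
// the lane-0 (sin) / lane-1 (cos) extracts.
#include <cmath>
struct SinCosPair { double Sin, Cos; };
static SinCosPair sincosPair(double X) { return {std::sin(X), std::cos(X)}; }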
33133
33134/// Widen a vector input to a vector of NVT. The
33135/// input vector must have the same element type as NVT.
33137 bool FillWithZeroes = false) {
33138 // Check if InOp already has the right width.
33139 MVT InVT = InOp.getSimpleValueType();
33140 if (InVT == NVT)
33141 return InOp;
33142
33143 if (InOp.isUndef())
33144 return DAG.getUNDEF(NVT);
33145
33147 "input and widen element type must match");
33148
33149 unsigned InNumElts = InVT.getVectorNumElements();
33150 unsigned WidenNumElts = NVT.getVectorNumElements();
33151 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33152 "Unexpected request for vector widening");
33153
33154 SDLoc dl(InOp);
33155 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33156 SDValue N1 = InOp.getOperand(1);
33157 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33158 N1.isUndef()) {
33159 InOp = InOp.getOperand(0);
33160 InVT = InOp.getSimpleValueType();
33161 InNumElts = InVT.getVectorNumElements();
33162 }
33163 }
33166 EVT EltVT = InOp.getOperand(0).getValueType();
33167 SDValue FillVal =
33168 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33170 Ops.append(WidenNumElts - InNumElts, FillVal);
33171 return DAG.getBuildVector(NVT, dl, Ops);
33172 }
33173 SDValue FillVal =
33174 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33176 DAG.getVectorIdxConstant(0, dl));
33177}
33178
33180 SelectionDAG &DAG) {
33181 assert(Subtarget.hasAVX512() &&
33182 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33183
33185 SDValue Src = N->getValue();
33186 MVT VT = Src.getSimpleValueType();
33187 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33188 SDLoc dl(Op);
33189
33190 SDValue Scale = N->getScale();
33191 SDValue Index = N->getIndex();
33192 SDValue Mask = N->getMask();
33193 SDValue Chain = N->getChain();
33194 SDValue BasePtr = N->getBasePtr();
33195
33196 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33197 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33198 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33199 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33200 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33201 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33202 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33203 SDVTList VTs = DAG.getVTList(MVT::Other);
33204 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33205 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33206 N->getMemoryVT(), N->getMemOperand());
33207 }
33208 return SDValue();
33209 }
33210
33211 MVT IndexVT = Index.getSimpleValueType();
33212
33213 // If the index is v2i32, we're being called by type legalization and we
33214 // should just let the default handling take care of it.
33215 if (IndexVT == MVT::v2i32)
33216 return SDValue();
33217
33218 // If we don't have VLX and neither the source data nor the index is 512 bits, we
33219 // need to widen until one is.
33220 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33221 !Index.getSimpleValueType().is512BitVector()) {
33222 // Determine how much we need to widen by to get a 512-bit type.
33223 unsigned Factor = std::min(512/VT.getSizeInBits(),
33224 512/IndexVT.getSizeInBits());
33225 unsigned NumElts = VT.getVectorNumElements() * Factor;
33226
33227 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33228 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33229 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33230
33231 Src = ExtendToType(Src, VT, DAG);
33232 Index = ExtendToType(Index, IndexVT, DAG);
33233 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33234 }
33235
33236 SDVTList VTs = DAG.getVTList(MVT::Other);
33237 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33238 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33239 N->getMemoryVT(), N->getMemOperand());
33240}
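// Illustrative worked example of the widening above (helper name is
// hypothetical): for v4f32 source data (128 bits) with a v4i32 index (128
// bits) and no VLX, Factor = min(512/128, 512/128) = 4, so data/index/mask
// are widened to v16f32 / v16i32 / v16i1 before the 512-bit scatter is
// emitted; the original lanes occupy elements 0..3 and the mask padding is
// zero, so the extra lanes stay inactive.
#include <algorithm>
static unsigned scatterWidenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512u / DataBits, 512u / IndexBits);
}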
33241
33242static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33243 SelectionDAG &DAG) {
33244
33246 MVT VT = Op.getSimpleValueType();
33247 MVT ScalarVT = VT.getScalarType();
33248 SDValue Mask = N->getMask();
33249 MVT MaskVT = Mask.getSimpleValueType();
33250 SDValue PassThru = N->getPassThru();
33251 SDLoc dl(Op);
33252
33253 // Handle AVX masked loads, which don't support a passthru value other than 0.
33254 if (MaskVT.getVectorElementType() != MVT::i1) {
33255 // We also allow undef in the isel pattern.
33256 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33257 return Op;
33258
33259 SDValue NewLoad = DAG.getMaskedLoad(
33260 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33261 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33262 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33263 N->isExpandingLoad());
33264 // Emit a blend.
33265 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33266 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33267 }
33268
33269 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33270 "Expanding masked load is supported on AVX-512 target only!");
33271
33272 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33273 "Expanding masked load is supported for 32 and 64-bit types only!");
33274
33275 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33276 "Cannot lower masked load op.");
33277
33278 assert((ScalarVT.getSizeInBits() >= 32 ||
33279 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33280 ScalarVT == MVT::f16))) &&
33281 "Unsupported masked load op.");
33282
33283 // This operation is legal for targets with VLX, but without
33284 // VLX the vector should be widened to 512 bits.
33285 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33286 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33287 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33288
33289 // Mask element has to be i1.
33290 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33291 "Unexpected mask type");
33292
33293 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33294
33295 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33296 SDValue NewLoad = DAG.getMaskedLoad(
33297 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33298 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33299 N->getExtensionType(), N->isExpandingLoad());
33300
33301 SDValue Extract =
33302 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33303 DAG.getVectorIdxConstant(0, dl));
33304 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33305 return DAG.getMergeValues(RetOps, dl);
33306}
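// Illustrative per-lane sketch of the blend above (helper name is
// hypothetical): AVX (pre-AVX-512) masked loads can only zero the disabled
// lanes, so a non-zero passthru is recovered by loading with a zero passthru
// and then selecting between the loaded value and the passthru for each lane.
static float avxMaskedLoadLane(bool MaskBit, float Loaded, float PassThru) {
  float ZeroMasked = MaskBit ? Loaded : 0.0f; // what the masked load itself yields
  return MaskBit ? ZeroMasked : PassThru;     // the VSELECT blend
}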
33307
33308static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33309 SelectionDAG &DAG) {
33311 SDValue DataToStore = N->getValue();
33312 MVT VT = DataToStore.getSimpleValueType();
33313 MVT ScalarVT = VT.getScalarType();
33314 SDValue Mask = N->getMask();
33315 SDLoc dl(Op);
33316
33317 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33318 "Expanding masked load is supported on AVX-512 target only!");
33319
33320 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33321 "Expanding masked load is supported for 32 and 64-bit types only!");
33322
33323 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33324 "Cannot lower masked store op.");
33325
33326 assert((ScalarVT.getSizeInBits() >= 32 ||
33327 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33328 ScalarVT == MVT::f16))) &&
33329 "Unsupported masked store op.");
33330
33331 // This operation is legal for targets with VLX, but without
33332 // VLX the vector should be widened to 512 bits.
33333 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33334 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33335
33336 // Mask element has to be i1.
33337 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33338 "Unexpected mask type");
33339
33340 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33341
33342 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33343 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33344 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33345 N->getOffset(), Mask, N->getMemoryVT(),
33346 N->getMemOperand(), N->getAddressingMode(),
33347 N->isTruncatingStore(), N->isCompressingStore());
33348}
33349
33350static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33351 SelectionDAG &DAG) {
33352 assert(Subtarget.hasAVX2() &&
33353 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33354
33356 SDLoc dl(Op);
33357 MVT VT = Op.getSimpleValueType();
33358 SDValue Index = N->getIndex();
33359 SDValue Mask = N->getMask();
33360 SDValue PassThru = N->getPassThru();
33361 MVT IndexVT = Index.getSimpleValueType();
33362
33363 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33364
33365 // If the index is v2i32, we're being called by type legalization.
33366 if (IndexVT == MVT::v2i32)
33367 return SDValue();
33368
33369 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33370 // need to widen until one is.
33371 MVT OrigVT = VT;
33372 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33373 !IndexVT.is512BitVector()) {
33374 // Determine how much we need to widen by to get a 512-bit type.
33375 unsigned Factor = std::min(512/VT.getSizeInBits(),
33376 512/IndexVT.getSizeInBits());
33377
33378 unsigned NumElts = VT.getVectorNumElements() * Factor;
33379
33380 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33381 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33382 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33383
33384 PassThru = ExtendToType(PassThru, VT, DAG);
33385 Index = ExtendToType(Index, IndexVT, DAG);
33386 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33387 }
33388
33389 // Break dependency on the data register.
33390 if (PassThru.isUndef())
33391 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33392
33393 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33394 N->getScale() };
33395 SDValue NewGather = DAG.getMemIntrinsicNode(
33396 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33397 N->getMemOperand());
33398 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33399 DAG.getVectorIdxConstant(0, dl));
33400 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33401}
33402
33404 SDLoc dl(Op);
33405 SDValue Src = Op.getOperand(0);
33406 MVT DstVT = Op.getSimpleValueType();
33407
33409 unsigned SrcAS = N->getSrcAddressSpace();
33410
33411 assert(SrcAS != N->getDestAddressSpace() &&
33412 "addrspacecast must be between different address spaces");
33413
33414 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33415 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33416 } else if (DstVT == MVT::i64) {
33417 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33418 } else if (DstVT == MVT::i32) {
33419 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33420 } else {
33421 report_fatal_error("Bad address space in addrspacecast");
33422 }
33423 return Op;
33424}
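// Illustrative sketch of the extension rules above as plain integer ops
// (helper names are hypothetical): a 32-bit unsigned pointer (ptr32_uptr)
// widens to 64 bits by zero-extension, other 32-bit pointers widen by
// sign-extension, and a 64-bit pointer narrows by truncation.
#include <cstdint>
static uint64_t widenUPtr32(uint32_t P) { return uint64_t(P); }                    // zext
static uint64_t widenSPtr32(uint32_t P) { return uint64_t(int64_t(int32_t(P))); }  // sext
static uint32_t narrowPtr64(uint64_t P) { return uint32_t(P); }                    // trunc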
33425
33426SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33427 SelectionDAG &DAG) const {
33428 // TODO: Eventually, the lowering of these nodes should be informed by or
33429 // deferred to the GC strategy for the function in which they appear. For
33430 // now, however, they must be lowered to something. Since they are logically
33431 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33432 // require special handling for these nodes), lower them as literal NOOPs for
33433 // the time being.
33435 Ops.push_back(Op.getOperand(0));
33436 if (Op->getGluedNode())
33437 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33438
33439 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33440 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33441}
33442
33443// Custom split CVTPS2PH with wide types.
33445 SDLoc dl(Op);
33446 EVT VT = Op.getValueType();
33447 SDValue Lo, Hi;
33448 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33449 EVT LoVT, HiVT;
33450 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33451 SDValue RC = Op.getOperand(1);
33452 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33453 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33454 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33455}
33456
33458 SelectionDAG &DAG) {
33459 unsigned IsData = Op.getConstantOperandVal(4);
33460
33461 // We don't support non-data prefetch without PREFETCHI.
33462 // Just preserve the chain.
33463 if (!IsData && !Subtarget.hasPREFETCHI())
33464 return Op.getOperand(0);
33465
33466 return Op;
33467}
33468
33470 SDNode *N = Op.getNode();
33471 SDValue Operand = N->getOperand(0);
33472 EVT VT = Operand.getValueType();
33473 SDLoc dl(N);
33474
33475 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33476
33477 // TODO: Fix crash for bf16 when generating strict_fmul, as it
33478 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33479 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33480 // promote this operator's result!
33481 SDValue Chain = DAG.getEntryNode();
33482 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33483 {Chain, Operand, One});
33484 return StrictFmul;
33485}
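// Illustrative sketch of the idea above (helper name is hypothetical):
// multiplying by 1.0 preserves ordinary values but canonicalizes the
// representation, e.g. it quiets a signaling NaN, which is why FCANONICALIZE
// is lowered as a strict x * 1.0.
static double canonicalizeViaMul(double X) {
  volatile double One = 1.0; // keep the multiply from being constant-folded
  return X * One;
}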
33486
33488 unsigned OpNo) {
33489 const APInt Operand(32, OpNo);
33490 std::string OpNoStr = llvm::toString(Operand, 10, false);
33491 std::string Str(" $");
33492
33493 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33494 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33495
33496 auto I = StringRef::npos;
33497 for (auto &AsmStr : AsmStrs) {
33498 // Match the OpNo string exactly so that we don't match a mere
33499 // sub-string, e.g. "$12" contains "$1".
33500 if (AsmStr.ends_with(OpNoStr1))
33501 I = AsmStr.size() - OpNoStr1.size();
33502
33503 // Get the index of operand in AsmStr.
33504 if (I == StringRef::npos)
33505 I = AsmStr.find(OpNoStr1 + ",");
33506 if (I == StringRef::npos)
33507 I = AsmStr.find(OpNoStr2);
33508
33509 if (I == StringRef::npos)
33510 continue;
33511
33512 assert(I > 0 && "Unexpected inline asm string!");
33513 // Remove the operand string and label (if it exists).
33514 // For example:
33515 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33516 // ==>
33517 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33518 // ==>
33519 // "call dword ptr "
33520 auto TmpStr = AsmStr.substr(0, I);
33521 I = TmpStr.rfind(':');
33522 if (I != StringRef::npos)
33523 TmpStr = TmpStr.substr(I + 1);
33524 return TmpStr.take_while(llvm::isAlpha);
33525 }
33526
33527 return StringRef();
33528}
33529
33531 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33532 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33533 // changed from indirect TargetLowering::C_Memory to direct
33534 // TargetLowering::C_Address.
33535 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33536 // location.
33537 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33538 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33539}
33540
33542 SDValue Mask) {
33543 EVT Ty = MVT::i8;
33544 auto V = DAG.getBitcast(MVT::i1, Mask);
33545 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33546 auto Zero = DAG.getConstant(0, DL, Ty);
33547 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33548 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33549 return SDValue(CmpZero.getNode(), 1);
33550}
33551
33553 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33554 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33555 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33556 // ->
33557 // _, flags = SUB 0, mask
33558 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33559 // bit_cast_to_vector<res>
33560 EVT VTy = PassThru.getValueType();
33561 EVT Ty = VTy.getVectorElementType();
33562 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33563 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33564 : DAG.getBitcast(Ty, PassThru);
33565 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33566 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33567 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33568 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33569 return DAG.getBitcast(VTy, NewLoad);
33570}
33571
33573 SDValue Chain,
33575 SDValue Val, SDValue Mask) const {
33576 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33577 // ->
33578 // _, flags = SUB 0, mask
33579 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33581 SDVTList Tys = DAG.getVTList(MVT::Other);
33582 auto ScalarVal = DAG.getBitcast(Ty, Val);
33583 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33584 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33585 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33586 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33587}
33588
33589/// Provide custom lowering hooks for some operations.
33591 switch (Op.getOpcode()) {
33592 // clang-format off
33593 default: llvm_unreachable("Should not custom lower this!");
33594 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33595 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33596 return LowerCMP_SWAP(Op, Subtarget, DAG);
33597 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33598 case ISD::ATOMIC_LOAD_ADD:
33599 case ISD::ATOMIC_LOAD_SUB:
33600 case ISD::ATOMIC_LOAD_OR:
33601 case ISD::ATOMIC_LOAD_XOR:
33602 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33603 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33604 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33605 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33606 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33607 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33608 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33609 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33610 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33611 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33612 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33613 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33614 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33615 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33616 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33617 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33618 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33619 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33620 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33621 case ISD::SHL_PARTS:
33622 case ISD::SRA_PARTS:
33623 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33624 case ISD::FSHL:
33625 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33626 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33628 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33630 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33631 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33632 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33633 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33634 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33637 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33638 case ISD::FP_TO_SINT:
33640 case ISD::FP_TO_UINT:
33641 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33643 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33644 case ISD::FP_EXTEND:
33645 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33646 case ISD::FP_ROUND:
33647 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33648 case ISD::FP16_TO_FP:
33649 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33650 case ISD::FP_TO_FP16:
33651 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33652 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33653 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33654 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33655 case ISD::FADD:
33656 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33657 case ISD::FROUND: return LowerFROUND(Op, DAG);
33658 case ISD::FABS:
33659 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33660 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33661 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33662 case ISD::LRINT:
33663 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33664 case ISD::SETCC:
33665 case ISD::STRICT_FSETCC:
33666 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33667 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33668 case ISD::SELECT: return LowerSELECT(Op, DAG);
33669 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33670 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33671 case ISD::VASTART: return LowerVASTART(Op, DAG);
33672 case ISD::VAARG: return LowerVAARG(Op, DAG);
33673 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33674 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33676 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33677 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33678 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33679 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33681 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33682 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33683 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33684 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33685 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33687 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33688 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33689 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33690 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33691 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33692 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33693 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33694 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33695 case ISD::CTLZ:
33696 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33697 case ISD::CTTZ:
33698 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33699 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33700 case ISD::MULHS:
33701 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33702 case ISD::ROTL:
33703 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33704 case ISD::SRA:
33705 case ISD::SRL:
33706 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33707 case ISD::SADDO:
33708 case ISD::UADDO:
33709 case ISD::SSUBO:
33710 case ISD::USUBO: return LowerXALUO(Op, DAG);
33711 case ISD::SMULO:
33712 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33713 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33714 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33715 case ISD::SADDO_CARRY:
33716 case ISD::SSUBO_CARRY:
33717 case ISD::UADDO_CARRY:
33718 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33719 case ISD::ADD:
33720 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33721 case ISD::UADDSAT:
33722 case ISD::SADDSAT:
33723 case ISD::USUBSAT:
33724 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33725 case ISD::SMAX:
33726 case ISD::SMIN:
33727 case ISD::UMAX:
33728 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33729 case ISD::FMINIMUM:
33730 case ISD::FMAXIMUM:
33731 case ISD::FMINIMUMNUM:
33732 case ISD::FMAXIMUMNUM:
33733 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33734 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33735 case ISD::ABDS:
33736 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33737 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33738 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33739 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33740 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33741 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33742 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33743 case ISD::GC_TRANSITION_START:
33744 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33745 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33746 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33747 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33748 // clang-format on
33749 }
33750}
33751
33752/// Replace a node with an illegal result type with a new node built out of
33753/// custom code.
33756 SelectionDAG &DAG) const {
33757 SDLoc dl(N);
33758 unsigned Opc = N->getOpcode();
33759 switch (Opc) {
33760 default:
33761#ifndef NDEBUG
33762 dbgs() << "ReplaceNodeResults: ";
33763 N->dump(&DAG);
33764#endif
33765 llvm_unreachable("Do not know how to custom type legalize this operation!");
33766 case X86ISD::CVTPH2PS: {
33767 EVT VT = N->getValueType(0);
33768 SDValue Lo, Hi;
33769 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33770 EVT LoVT, HiVT;
33771 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33772 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33773 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33774 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33775 Results.push_back(Res);
33776 return;
33777 }
33779 EVT VT = N->getValueType(0);
33780 SDValue Lo, Hi;
33781 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33782 EVT LoVT, HiVT;
33783 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33784 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33785 {N->getOperand(0), Lo});
33786 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33787 {N->getOperand(0), Hi});
33788 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33789 Lo.getValue(1), Hi.getValue(1));
33790 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33791 Results.push_back(Res);
33792 Results.push_back(Chain);
33793 return;
33794 }
33795 case X86ISD::CVTPS2PH:
33796 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33797 return;
33798 case ISD::CTPOP: {
33799 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33800 // If we have at most 32 active bits, then perform as i32 CTPOP.
33801 // TODO: Perform this in generic legalizer?
33802 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33803 unsigned LZ = Known.countMinLeadingZeros();
33804 unsigned TZ = Known.countMinTrailingZeros();
33805 if ((LZ + TZ) >= 32) {
33806 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33807 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33808 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33809 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33810 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33811 Results.push_back(Op);
33812 return;
33813 }
33814 // Use a v2i64 if possible.
33815 bool NoImplicitFloatOps =
33817 Attribute::NoImplicitFloat);
33818 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33819 SDValue Wide =
33820 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33821 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33822 // The bit count fits in 32 bits, so extract it as an i32 and then zero
33823 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33824 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33825 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33826 DAG.getVectorIdxConstant(0, dl));
33827 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33828 Results.push_back(Wide);
33829 }
33830 return;
33831 }
33832 case ISD::MUL: {
33833 EVT VT = N->getValueType(0);
33835 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33836 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33837 // elements are needed.
33838 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33839 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33840 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33841 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33842 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33843 unsigned NumConcats = 16 / VT.getVectorNumElements();
33844 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33845 ConcatOps[0] = Res;
33846 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33847 Results.push_back(Res);
33848 return;
33849 }
33850 case ISD::SMULO:
33851 case ISD::UMULO: {
33852 EVT VT = N->getValueType(0);
33854 VT == MVT::v2i32 && "Unexpected VT!");
33855 bool IsSigned = Opc == ISD::SMULO;
33856 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33857 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33858 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33859 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33860 // Extract the high 32 bits from each result using PSHUFD.
33861 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33862 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33863 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33864 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33865 DAG.getVectorIdxConstant(0, dl));
33866
33867 // Truncate the low bits of the result. This will become PSHUFD.
33868 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33869
33870 SDValue HiCmp;
33871 if (IsSigned) {
33872 // SMULO overflows if the high bits don't match the sign of the low.
33873 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33874 } else {
33875 // UMULO overflows if the high bits are non-zero.
33876 HiCmp = DAG.getConstant(0, dl, VT);
33877 }
33878 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33879
33880 // Widen the result by padding with undef.
33881 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33882 DAG.getUNDEF(VT));
33883 Results.push_back(Res);
33884 Results.push_back(Ovf);
33885 return;
33886 }
33887 case X86ISD::VPMADDWD: {
33888 // Legalize types for X86ISD::VPMADDWD by widening.
33889 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33890
33891 EVT VT = N->getValueType(0);
33892 EVT InVT = N->getOperand(0).getValueType();
33893 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33894 "Expected a VT that divides into 128 bits.");
33896 "Unexpected type action!");
33897 unsigned NumConcat = 128 / InVT.getSizeInBits();
33898
33899 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33900 InVT.getVectorElementType(),
33901 NumConcat * InVT.getVectorNumElements());
33902 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33904 NumConcat * VT.getVectorNumElements());
33905
33906 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33907 Ops[0] = N->getOperand(0);
33908 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33909 Ops[0] = N->getOperand(1);
33910 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33911
33912 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33913 Results.push_back(Res);
33914 return;
33915 }
33916 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33917 case X86ISD::FMINC:
33918 case X86ISD::FMIN:
33919 case X86ISD::FMAXC:
33920 case X86ISD::FMAX:
33922 case X86ISD::STRICT_FMAX: {
33923 EVT VT = N->getValueType(0);
33924 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33925 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33926 SDValue UNDEF = DAG.getUNDEF(VT);
33927 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33928 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33929 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33930 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33931 SDValue Res;
33932 if (IsStrict)
33933 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33934 {N->getOperand(0), LHS, RHS});
33935 else
33936 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33937 Results.push_back(Res);
33938 if (IsStrict)
33939 Results.push_back(Res.getValue(1));
33940 return;
33941 }
33942 case ISD::SDIV:
33943 case ISD::UDIV:
33944 case ISD::SREM:
33945 case ISD::UREM: {
33946 EVT VT = N->getValueType(0);
33947 if (VT.isVector()) {
33949 "Unexpected type action!");
33950 // If the RHS is a constant splat vector, we can widen this and let the
33951 // division/remainder-by-constant optimization handle it.
33952 // TODO: Can we do something for non-splat?
33953 APInt SplatVal;
33954 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33955 unsigned NumConcats = 128 / VT.getSizeInBits();
33956 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33957 Ops0[0] = N->getOperand(0);
33958 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33959 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33960 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33961 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33962 Results.push_back(Res);
33963 }
33964 return;
33965 }
33966
33967 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33968 Results.push_back(V);
33969 return;
33970 }
33971 case ISD::TRUNCATE: {
33972 MVT VT = N->getSimpleValueType(0);
33973 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33974 return;
33975
33976 // The generic legalizer will try to widen the input type to the same
33977 // number of elements as the widened result type. But this isn't always
33978 // the best strategy, so do some custom legalization to avoid certain cases.
33979 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33980 SDValue In = N->getOperand(0);
33981 EVT InVT = In.getValueType();
33982 EVT InEltVT = InVT.getVectorElementType();
33983 EVT EltVT = VT.getVectorElementType();
33984 unsigned MinElts = VT.getVectorNumElements();
33985 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33986 unsigned InBits = InVT.getSizeInBits();
33987
33988 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33989 unsigned PackOpcode;
33990 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33991 Subtarget, N->getFlags())) {
33992 if (SDValue Res =
33993 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33994 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33995 Results.push_back(Res);
33996 return;
33997 }
33998 }
33999
34000 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
34001 // 128-bit and smaller inputs should avoid the truncate altogether and
34002 // use a shuffle.
34003 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
34004 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
34005 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
34006 for (unsigned I = 0; I < MinElts; ++I)
34007 TruncMask[I] = Scale * I;
34008 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
34009 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
34010 "Illegal vector type in truncation");
34011 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
34012 Results.push_back(
34013 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
34014 return;
34015 }
34016 }
34017
34018 // With AVX512 there are some cases that can use a target specific
34019 // truncate node to go from 256/512 to less than 128 with zeros in the
34020 // upper elements of the 128 bit result.
34021 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34022 // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
34023 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34024 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34025 return;
34026 }
34027 // There's one case we can widen to 512 bits and use VTRUNC.
34028 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34029 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34030 DAG.getUNDEF(MVT::v4i64));
34031 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34032 return;
34033 }
34034 }
34035 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34036 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34037 isTypeLegal(MVT::v4i64)) {
34038 // The input needs to be split and the output needs to be widened. Let's use
34039 // two VTRUNCs, and shuffle their results together into the wider type.
34040 SDValue Lo, Hi;
34041 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34042
34043 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34044 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34045 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34046 { 0, 1, 2, 3, 16, 17, 18, 19,
34047 -1, -1, -1, -1, -1, -1, -1, -1 });
34048 Results.push_back(Res);
34049 return;
34050 }
34051
34052 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34053 // this via type legalization.
34054 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34055 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34056 (!Subtarget.hasSSSE3() ||
34057 (!isTypeLegal(InVT) &&
34058 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34059 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34060 InEltVT.getSizeInBits() * WidenNumElts);
34061 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34062 return;
34063 }
34064
34065 return;
34066 }
34067 case ISD::ANY_EXTEND:
34068 // Right now, only MVT::v8i8 has a Custom action for an illegal type;
34069 // it's intended to custom-handle the input type.
34070 assert(N->getValueType(0) == MVT::v8i8 &&
34071 "Do not know how to legalize this Node");
34072 return;
34073 case ISD::SIGN_EXTEND:
34074 case ISD::ZERO_EXTEND: {
34075 EVT VT = N->getValueType(0);
34076 SDValue In = N->getOperand(0);
34077 EVT InVT = In.getValueType();
34078 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34079 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34081 "Unexpected type action!");
34082 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34083 // Custom split this so we can extend i8/i16->i32 in-vector. This is better
34084 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
34085 // followed by an extend from i32 to i64 using pcmpgt. By custom splitting
34086 // we allow the sra from the extend to i32 to be shared by the split.
34087 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34088
34089 // Fill a vector with sign bits for each element.
34090 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34091 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34092
34093 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34094 // to v2i64.
34095 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34096 {0, 4, 1, 5});
34097 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34098 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34099 {2, 6, 3, 7});
34100 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34101
34102 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34103 Results.push_back(Res);
34104 return;
34105 }
34106
34107 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34108 if (!InVT.is128BitVector()) {
34109 // Not a 128 bit vector, but maybe type legalization will promote
34110 // it to 128 bits.
34111 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34112 return;
34113 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34114 if (!InVT.is128BitVector())
34115 return;
34116
34117 // Promote the input to 128 bits. Type legalization will turn this into
34118 // zext_inreg/sext_inreg.
34119 In = DAG.getNode(Opc, dl, InVT, In);
34120 }
34121
34122 // Perform custom splitting instead of the two stage extend we would get
34123 // by default.
34124 EVT LoVT, HiVT;
34125 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34126 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34127
34128 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34129
34130 // We need to shift the input over by half the number of elements.
34131 unsigned NumElts = InVT.getVectorNumElements();
34132 unsigned HalfNumElts = NumElts / 2;
34133 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34134 for (unsigned i = 0; i != HalfNumElts; ++i)
34135 ShufMask[i] = i + HalfNumElts;
34136
34137 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34138 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34139
34140 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34141 Results.push_back(Res);
34142 }
34143 return;
34144 }
34146 case ISD::FP_TO_UINT_SAT: {
34147 if (!Subtarget.hasAVX10_2())
34148 return;
34149
34150 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34151 EVT VT = N->getValueType(0);
34152 SDValue Op = N->getOperand(0);
34153 EVT OpVT = Op.getValueType();
34154 SDValue Res;
34155
34156 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34157 if (IsSigned)
34158 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34159 else
34160 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34161 Results.push_back(Res);
34162 }
34163 return;
34164 }
34165 case ISD::FP_TO_SINT:
34167 case ISD::FP_TO_UINT:
34169 bool IsStrict = N->isStrictFPOpcode();
34170 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34171 EVT VT = N->getValueType(0);
34172 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34173 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34174 EVT SrcVT = Src.getValueType();
34175
34176 SDValue Res;
34177 if (isSoftF16(SrcVT, Subtarget)) {
34178 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34179 if (IsStrict) {
34180 Res =
34181 DAG.getNode(Opc, dl, {VT, MVT::Other},
34182 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34183 {NVT, MVT::Other}, {Chain, Src})});
34184 Chain = Res.getValue(1);
34185 } else {
34186 Res =
34187 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34188 }
34189 Results.push_back(Res);
34190 if (IsStrict)
34191 Results.push_back(Chain);
34192
34193 return;
34194 }
34195
34196 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34197 SrcVT.getVectorElementType() == MVT::f16) {
34198 EVT EleVT = VT.getVectorElementType();
34199 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34200
34201 if (SrcVT != MVT::v8f16) {
34202 SDValue Tmp =
34203 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34204 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34205 Ops[0] = Src;
34206 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34207 }
34208
34209 if (IsStrict) {
34211 Res =
34212 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34213 Chain = Res.getValue(1);
34214 } else {
34215 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34216 Res = DAG.getNode(Opc, dl, ResVT, Src);
34217 }
34218
34219 // TODO: Need to add exception check code for strict FP.
34220 if (EleVT.getSizeInBits() < 16) {
34221 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34222 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34223
34224 // Now widen to 128 bits.
34225 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34226 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34227 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34228 ConcatOps[0] = Res;
34229 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34230 }
34231
34232 Results.push_back(Res);
34233 if (IsStrict)
34234 Results.push_back(Chain);
34235
34236 return;
34237 }
34238
34239 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34240 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34241 "Unexpected type action!");
34242
34243 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34244 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34245 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34246 VT.getVectorNumElements());
34247 SDValue Res;
34248 SDValue Chain;
34249 if (IsStrict) {
34250 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34251 {N->getOperand(0), Src});
34252 Chain = Res.getValue(1);
34253 } else
34254 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34255
34256 // Preserve what we know about the size of the original result. If the
34257 // result is v2i32, we have to manually widen the assert.
34258 if (PromoteVT == MVT::v2i32)
34259 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34260 DAG.getUNDEF(MVT::v2i32));
34261
34262 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34263 Res.getValueType(), Res,
34264 DAG.getValueType(VT.getVectorElementType()));
34265
34266 if (PromoteVT == MVT::v2i32)
34267 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34268 DAG.getVectorIdxConstant(0, dl));
34269
34270 // Truncate back to the original width.
34271 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34272
34273 // Now widen to 128 bits.
34274 unsigned NumConcats = 128 / VT.getSizeInBits();
34275 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34276 VT.getVectorNumElements() * NumConcats);
34277 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34278 ConcatOps[0] = Res;
34279 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34280 Results.push_back(Res);
34281 if (IsStrict)
34282 Results.push_back(Chain);
34283 return;
34284 }
34285
34286
34287 if (VT == MVT::v2i32) {
34288 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34289 "Strict unsigned conversion requires AVX512");
34290 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34291 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34292 "Unexpected type action!");
34293 if (Src.getValueType() == MVT::v2f64) {
34294 if (!IsSigned && !Subtarget.hasAVX512()) {
34295 SDValue Res =
34296 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34297 Results.push_back(Res);
34298 return;
34299 }
34300
34301 if (IsStrict)
34302 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34303 else
34304 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34305
34306 // If we have VLX we can emit a target specific FP_TO_UINT node.
34307 if (!IsSigned && !Subtarget.hasVLX()) {
34308 // Otherwise we can defer to the generic legalizer which will widen
34309 // the input as well. This will be further widened during op
34310 // legalization to v8i32<-v8f64.
34311 // For strict nodes we'll need to widen ourselves.
34312 // FIXME: Fix the type legalizer to safely widen strict nodes?
34313 if (!IsStrict)
34314 return;
34315 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34316 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34317 Opc = N->getOpcode();
34318 }
34319 SDValue Res;
34320 SDValue Chain;
34321 if (IsStrict) {
34322 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34323 {N->getOperand(0), Src});
34324 Chain = Res.getValue(1);
34325 } else {
34326 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34327 }
34328 Results.push_back(Res);
34329 if (IsStrict)
34330 Results.push_back(Chain);
34331 return;
34332 }
34333
34334 // Custom widen strict v2f32->v2i32 by padding with zeros.
34335 // FIXME: Should generic type legalizer do this?
34336 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34337 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34338 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34339 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34340 {N->getOperand(0), Src});
34341 Results.push_back(Res);
34342 Results.push_back(Res.getValue(1));
34343 return;
34344 }
34345
34346 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34347 // so early out here.
34348 return;
34349 }
34350
34351 assert(!VT.isVector() && "Vectors should have been handled above!");
34352
34353 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34354 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34355 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34356 assert(!Subtarget.is64Bit() && "i64 should be legal");
34357 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34358 // If we use a 128-bit result we might need to use a target specific node.
34359 unsigned SrcElts =
34360 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34361 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34362 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34363 if (NumElts != SrcElts) {
34364 if (IsStrict)
34365 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34366 else
34367 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34368 }
34369
34370 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34371 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34372 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34373 ZeroIdx);
34374 SDValue Chain;
34375 if (IsStrict) {
34376 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34377 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34378 Chain = Res.getValue(1);
34379 } else
34380 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34381 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34382 Results.push_back(Res);
34383 if (IsStrict)
34384 Results.push_back(Chain);
34385 return;
34386 }
34387
34388 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34389 SDValue Chain;
34390 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34391 Results.push_back(V);
34392 if (IsStrict)
34393 Results.push_back(Chain);
34394 return;
34395 }
34396
34397 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34398 Results.push_back(V);
34399 if (IsStrict)
34400 Results.push_back(Chain);
34401 }
34402 return;
34403 }
34404 case ISD::LRINT:
34405 if (N->getValueType(0) == MVT::v2i32) {
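// For v2i32 results, make sure the source is a full 128-bit vector (widening
// v2f16 to v8f16; v2f64 already is) and emit a single CVTP2SI. Pushing the
// widened v4i32 result is what the type legalizer expects for a v2i32 node.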
34406 SDValue Src = N->getOperand(0);
34407 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34408 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34409 DAG.getUNDEF(MVT::v2f16));
34410 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34411 DAG.getUNDEF(MVT::v4f16));
34412 } else if (Src.getValueType() != MVT::v2f64) {
34413 return;
34414 }
34415 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34416 return;
34417 }
34418 [[fallthrough]];
34419 case ISD::LLRINT: {
34420 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34421 Results.push_back(V);
34422 return;
34423 }
34424
34425 case ISD::SINT_TO_FP:
34426 case ISD::STRICT_SINT_TO_FP:
34427 case ISD::UINT_TO_FP:
34428 case ISD::STRICT_UINT_TO_FP: {
34429 bool IsStrict = N->isStrictFPOpcode();
34430 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34431 EVT VT = N->getValueType(0);
34432 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34433 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34434 Subtarget.hasVLX()) {
34435 if (Src.getValueType().getVectorElementType() == MVT::i16)
34436 return;
34437
34438 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34439 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34440 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34441 : DAG.getUNDEF(MVT::v2i32));
34442 if (IsStrict) {
34443 unsigned Opc =
34444 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34445 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34446 {N->getOperand(0), Src});
34447 Results.push_back(Res);
34448 Results.push_back(Res.getValue(1));
34449 } else {
34450 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34451 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34452 }
34453 return;
34454 }
34455 if (VT != MVT::v2f32)
34456 return;
34457 EVT SrcVT = Src.getValueType();
34458 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34459 if (IsStrict) {
34460 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34461 : X86ISD::STRICT_CVTUI2P;
34462 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34463 {N->getOperand(0), Src});
34464 Results.push_back(Res);
34465 Results.push_back(Res.getValue(1));
34466 } else {
34467 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34468 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34469 }
34470 return;
34471 }
34472 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34473 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
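// Unsigned v2i64 -> v2f32 without AVX512: if the sign bit is set, halve the
// value (ORing the low bit back in to keep rounding correct), do a signed
// conversion, then double the result with the FADD below; otherwise the
// signed conversion of the original value is used directly.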
34474 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34475 SDValue One = DAG.getConstant(1, dl, SrcVT);
34476 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34477 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34478 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34479 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34480 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34481 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34482 for (int i = 0; i != 2; ++i) {
34483 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34484 SignSrc, DAG.getVectorIdxConstant(i, dl));
34485 if (IsStrict)
34486 SignCvts[i] =
34487 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34488 {N->getOperand(0), Elt});
34489 else
34490 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34491 }
34492 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34493 SDValue Slow, Chain;
34494 if (IsStrict) {
34495 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34496 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34497 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34498 {Chain, SignCvt, SignCvt});
34499 Chain = Slow.getValue(1);
34500 } else {
34501 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34502 }
34503 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34504 IsNeg =
34505 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34506 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34507 Results.push_back(Cvt);
34508 if (IsStrict)
34509 Results.push_back(Chain);
34510 return;
34511 }
34512
34513 if (SrcVT != MVT::v2i32)
34514 return;
34515
34516 if (IsSigned || Subtarget.hasAVX512()) {
34517 if (!IsStrict)
34518 return;
34519
34520 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34521 // FIXME: Should generic type legalizer do this?
34522 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34523 DAG.getConstant(0, dl, MVT::v2i32));
34524 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34525 {N->getOperand(0), Src});
34526 Results.push_back(Res);
34527 Results.push_back(Res.getValue(1));
34528 return;
34529 }
34530
34531 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
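// Classic uint32 -> double trick: OR the zero-extended value into the
// mantissa of 2^52 (0x4330000000000000), subtract 2^52, then round the f64
// result down to f32.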
34532 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34533 SDValue VBias = DAG.getConstantFP(
34534 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34535 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34536 DAG.getBitcast(MVT::v2i64, VBias));
34537 Or = DAG.getBitcast(MVT::v2f64, Or);
34538 if (IsStrict) {
34539 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34540 {N->getOperand(0), Or, VBias});
34541 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34542 {MVT::v4f32, MVT::Other},
34543 {Sub.getValue(1), Sub});
34544 Results.push_back(Res);
34545 Results.push_back(Res.getValue(1));
34546 } else {
34547 // TODO: Are there any fast-math-flags to propagate here?
34548 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34549 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34550 }
34551 return;
34552 }
34553 case ISD::STRICT_FP_ROUND:
34554 case ISD::FP_ROUND: {
34555 bool IsStrict = N->isStrictFPOpcode();
34556 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34557 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34558 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34559 EVT SrcVT = Src.getValueType();
34560 EVT VT = N->getValueType(0);
34561 SDValue V;
34562 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34563 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34564 : DAG.getUNDEF(MVT::v2f32);
34565 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34566 }
34567 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34568 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34569 if (SrcVT.getVectorElementType() != MVT::f32)
34570 return;
34571
34572 if (IsStrict)
34573 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34574 {Chain, Src, Rnd});
34575 else
34576 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34577
34578 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34579 if (IsStrict)
34580 Results.push_back(V.getValue(1));
34581 return;
34582 }
34583 if (!isTypeLegal(Src.getValueType()))
34584 return;
34585 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34586 if (IsStrict)
34587 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34588 {Chain, Src});
34589 else
34590 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34591 Results.push_back(V);
34592 if (IsStrict)
34593 Results.push_back(V.getValue(1));
34594 return;
34595 }
34596 case ISD::FP_EXTEND:
34597 case ISD::STRICT_FP_EXTEND: {
34598 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34599 // No other ValueType for FP_EXTEND should reach this point.
34600 assert(N->getValueType(0) == MVT::v2f32 &&
34601 "Do not know how to legalize this Node");
34602 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34603 return;
34604 bool IsStrict = N->isStrictFPOpcode();
34605 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34606 if (Src.getValueType().getVectorElementType() != MVT::f16)
34607 return;
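// Pad the v2f16 source out to v4f16; strict nodes use zero rather than undef
// so the padding lanes cannot raise FP exceptions.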
34608 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34609 : DAG.getUNDEF(MVT::v2f16);
34610 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34611 if (IsStrict)
34612 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34613 {N->getOperand(0), V});
34614 else
34615 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34616 Results.push_back(V);
34617 if (IsStrict)
34618 Results.push_back(V.getValue(1));
34619 return;
34620 }
34621 case ISD::INTRINSIC_W_CHAIN: {
34622 unsigned IntNo = N->getConstantOperandVal(1);
34623 switch (IntNo) {
34624 default : llvm_unreachable("Do not know how to custom type "
34625 "legalize this intrinsic operation!");
34626 case Intrinsic::x86_rdtsc:
34627 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34628 Results);
34629 case Intrinsic::x86_rdtscp:
34630 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34631 Results);
34632 case Intrinsic::x86_rdpmc:
34633 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34634 Results);
34635 return;
34636 case Intrinsic::x86_rdpru:
34637 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34638 Results);
34639 return;
34640 case Intrinsic::x86_xgetbv:
34641 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34642 Results);
34643 return;
34644 }
34645 }
34646 case ISD::READCYCLECOUNTER: {
34647 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34648 }
34649 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34650 EVT T = N->getValueType(0);
34651 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34652 bool Regs64bit = T == MVT::i128;
34653 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34654 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34655 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34656 SDValue cpInL, cpInH;
34657 std::tie(cpInL, cpInH) =
34658 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34659 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34660 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34661 cpInH =
34662 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34663 cpInH, cpInL.getValue(1));
34664 SDValue swapInL, swapInH;
34665 std::tie(swapInL, swapInH) =
34666 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34667 swapInH =
34668 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34669 swapInH, cpInH.getValue(1));
34670
34671 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34672 // until later. So we keep the RBX input in a vreg and use a custom
34673 // inserter.
34674 // Since RBX will be a reserved register, the register allocator will not
34675 // make sure its value is properly saved and restored around this
34676 // live-range.
34677 SDValue Result;
34678 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34679 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34680 if (Regs64bit) {
34681 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34682 swapInH.getValue(1)};
34683 Result =
34684 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34685 } else {
34686 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34687 swapInH.getValue(1));
34688 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34689 swapInL.getValue(1)};
34690 Result =
34691 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34692 }
34693
34694 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34695 Regs64bit ? X86::RAX : X86::EAX,
34696 HalfT, Result.getValue(1));
34697 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34698 Regs64bit ? X86::RDX : X86::EDX,
34699 HalfT, cpOutL.getValue(2));
34700 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34701
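// CMPXCHG{8,16}B reports success in ZF; read EFLAGS and materialize the
// boolean success value from COND_E.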
34702 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34703 MVT::i32, cpOutH.getValue(2));
34704 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34705 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34706
34707 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34708 Results.push_back(Success);
34709 Results.push_back(EFLAGS.getValue(1));
34710 return;
34711 }
34712 case ISD::ATOMIC_LOAD: {
34713 assert(
34714 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34715 "Unexpected VT!");
34716 bool NoImplicitFloatOps =
34717 DAG.getMachineFunction().getFunction().hasFnAttribute(
34718 Attribute::NoImplicitFloat);
34719 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34720 auto *Node = cast<AtomicSDNode>(N);
34721
34722 if (N->getValueType(0) == MVT::i128) {
34723 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
34724 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34725 Node->getBasePtr(), Node->getMemOperand());
34726 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34727 DAG.getVectorIdxConstant(0, dl));
34728 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34729 DAG.getVectorIdxConstant(1, dl));
34730 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34731 {ResL, ResH}));
34732 Results.push_back(Ld.getValue(1));
34733 return;
34734 }
34735 break;
34736 }
34737 if (Subtarget.hasSSE1()) {
34738 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34739 // Then extract the lower 64-bits.
34740 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34741 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34742 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34743 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34744 MVT::i64, Node->getMemOperand());
34745 if (Subtarget.hasSSE2()) {
34746 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34747 DAG.getVectorIdxConstant(0, dl));
34748 Results.push_back(Res);
34749 Results.push_back(Ld.getValue(1));
34750 return;
34751 }
34752 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34753 // then casts to i64. This avoids a 128-bit stack temporary being
34754 // created by type legalization if we were to cast v4f32->v2i64.
34755 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34756 DAG.getVectorIdxConstant(0, dl));
34757 Res = DAG.getBitcast(MVT::i64, Res);
34758 Results.push_back(Res);
34759 Results.push_back(Ld.getValue(1));
34760 return;
34761 }
34762 if (Subtarget.hasX87()) {
34763 // First load this into an 80-bit X87 register. This will put the whole
34764 // integer into the significand.
34765 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34766 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34767 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34768 dl, Tys, Ops, MVT::i64,
34769 Node->getMemOperand());
34770 SDValue Chain = Result.getValue(1);
34771
34772 // Now store the X87 register to a stack temporary and convert to i64.
34773 // This store is not atomic and doesn't need to be.
34774 // FIXME: We don't need a stack temporary if the result of the load
34775 // is already being stored. We could just directly store there.
34776 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34777 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34778 MachinePointerInfo MPI =
34779 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34780 SDValue StoreOps[] = { Chain, Result, StackPtr };
34781 Chain = DAG.getMemIntrinsicNode(
34782 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34783 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34784
34785 // Finally load the value back from the stack temporary and return it.
34786 // This load is not atomic and doesn't need to be.
34787 // This load will be further type legalized.
34788 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34789 Results.push_back(Result);
34790 Results.push_back(Result.getValue(1));
34791 return;
34792 }
34793 }
34794 // TODO: Use MOVLPS when SSE1 is available?
34795 // Delegate to generic TypeLegalization. Situations we can really handle
34796 // should have already been dealt with by AtomicExpandPass.cpp.
34797 break;
34798 }
34799 case ISD::ATOMIC_SWAP:
34800 case ISD::ATOMIC_LOAD_ADD:
34801 case ISD::ATOMIC_LOAD_SUB:
34802 case ISD::ATOMIC_LOAD_AND:
34803 case ISD::ATOMIC_LOAD_OR:
34804 case ISD::ATOMIC_LOAD_XOR:
34805 case ISD::ATOMIC_LOAD_NAND:
34806 case ISD::ATOMIC_LOAD_MIN:
34807 case ISD::ATOMIC_LOAD_MAX:
34808 case ISD::ATOMIC_LOAD_UMIN:
34809 case ISD::ATOMIC_LOAD_UMAX:
34810 // Delegate to generic TypeLegalization. Situations we can really handle
34811 // should have already been dealt with by AtomicExpandPass.cpp.
34812 break;
34813
34814 case ISD::BITCAST: {
34815 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34816 EVT DstVT = N->getValueType(0);
34817 EVT SrcVT = N->getOperand(0).getValueType();
34818
34819 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34820 // we can split using the k-register rather than memory.
34821 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34822 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34823 SDValue Lo, Hi;
34824 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34825 Lo = DAG.getBitcast(MVT::i32, Lo);
34826 Hi = DAG.getBitcast(MVT::i32, Hi);
34827 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34828 Results.push_back(Res);
34829 return;
34830 }
34831
34832 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34833 // FIXME: Use v4f32 for SSE1?
34834 assert(Subtarget.hasSSE2() && "Requires SSE2");
34835 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34836 "Unexpected type action!");
34837 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34838 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34839 N->getOperand(0));
34840 Res = DAG.getBitcast(WideVT, Res);
34841 Results.push_back(Res);
34842 return;
34843 }
34844
34845 return;
34846 }
34847 case ISD::MGATHER: {
34848 EVT VT = N->getValueType(0);
34849 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34850 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34851 auto *Gather = cast<MaskedGatherSDNode>(N);
34852 SDValue Index = Gather->getIndex();
34853 if (Index.getValueType() != MVT::v2i64)
34854 return;
34855 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34856 "Unexpected type action!");
34857 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34858 SDValue Mask = Gather->getMask();
34859 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34860 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34861 Gather->getPassThru(),
34862 DAG.getUNDEF(VT));
34863 if (!Subtarget.hasVLX()) {
34864 // We need to widen the mask, but the instruction will only use 2
34865 // of its elements. So we can use undef.
34866 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34867 DAG.getUNDEF(MVT::v2i1));
34868 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34869 }
34870 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34871 Gather->getBasePtr(), Index, Gather->getScale() };
34872 SDValue Res = DAG.getMemIntrinsicNode(
34873 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34874 Gather->getMemoryVT(), Gather->getMemOperand());
34875 Results.push_back(Res);
34876 Results.push_back(Res.getValue(1));
34877 return;
34878 }
34879 return;
34880 }
34881 case ISD::LOAD: {
34882 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34883 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34884 // cast since type legalization will try to use an i64 load.
34885 MVT VT = N->getSimpleValueType(0);
34886 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34887 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34888 "Unexpected type action!");
34889 if (!ISD::isNON_EXTLoad(N))
34890 return;
34891 auto *Ld = cast<LoadSDNode>(N);
34892 if (Subtarget.hasSSE2()) {
34893 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34894 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34895 Ld->getPointerInfo(), Ld->getBaseAlign(),
34896 Ld->getMemOperand()->getFlags());
34897 SDValue Chain = Res.getValue(1);
34898 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34899 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34900 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34901 Res = DAG.getBitcast(WideVT, Res);
34902 Results.push_back(Res);
34903 Results.push_back(Chain);
34904 return;
34905 }
34906 assert(Subtarget.hasSSE1() && "Expected SSE");
34907 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34908 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34909 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34910 MVT::i64, Ld->getMemOperand());
34911 Results.push_back(Res);
34912 Results.push_back(Res.getValue(1));
34913 return;
34914 }
34915 case ISD::ADDRSPACECAST: {
34916 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34917 Results.push_back(V);
34918 return;
34919 }
34920 case ISD::BITREVERSE: {
34921 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34922 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34923 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34924 // We'll need to move the scalar in two i32 pieces.
34925 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34926 return;
34927 }
34928 case ISD::EXTRACT_VECTOR_ELT: {
34929 // f16 = extract vXf16 %vec, i64 %idx
34930 assert(N->getSimpleValueType(0) == MVT::f16 &&
34931 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34932 assert(Subtarget.hasFP16() && "Expected FP16");
34933 SDValue VecOp = N->getOperand(0);
34934 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34935 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34936 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34937 N->getOperand(1));
34938 Split = DAG.getBitcast(MVT::f16, Split);
34939 Results.push_back(Split);
34940 return;
34941 }
34942 }
34943}
34944
34945const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34946 switch ((X86ISD::NodeType)Opcode) {
34947 case X86ISD::FIRST_NUMBER: break;
34948#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34949 NODE_NAME_CASE(BSF)
34950 NODE_NAME_CASE(BSR)
34951 NODE_NAME_CASE(FSHL)
34952 NODE_NAME_CASE(FSHR)
34953 NODE_NAME_CASE(FAND)
34954 NODE_NAME_CASE(FANDN)
34955 NODE_NAME_CASE(FOR)
34956 NODE_NAME_CASE(FXOR)
34957 NODE_NAME_CASE(FILD)
34958 NODE_NAME_CASE(FIST)
34959 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34960 NODE_NAME_CASE(FLD)
34961 NODE_NAME_CASE(FST)
34962 NODE_NAME_CASE(CALL)
34963 NODE_NAME_CASE(CALL_RVMARKER)
34964 NODE_NAME_CASE(IMP_CALL)
34966 NODE_NAME_CASE(CMP)
34967 NODE_NAME_CASE(FCMP)
34968 NODE_NAME_CASE(STRICT_FCMP)
34969 NODE_NAME_CASE(STRICT_FCMPS)
34970 NODE_NAME_CASE(COMI)
34971 NODE_NAME_CASE(UCOMI)
34972 NODE_NAME_CASE(COMX)
34973 NODE_NAME_CASE(UCOMX)
34974 NODE_NAME_CASE(CMPM)
34975 NODE_NAME_CASE(CMPMM)
34976 NODE_NAME_CASE(STRICT_CMPM)
34977 NODE_NAME_CASE(CMPMM_SAE)
34978 NODE_NAME_CASE(SETCC)
34979 NODE_NAME_CASE(SETCC_CARRY)
34980 NODE_NAME_CASE(FSETCC)
34981 NODE_NAME_CASE(FSETCCM)
34982 NODE_NAME_CASE(FSETCCM_SAE)
34983 NODE_NAME_CASE(CMOV)
34984 NODE_NAME_CASE(BRCOND)
34985 NODE_NAME_CASE(RET_GLUE)
34986 NODE_NAME_CASE(IRET)
34987 NODE_NAME_CASE(REP_STOS)
34988 NODE_NAME_CASE(REP_MOVS)
34989 NODE_NAME_CASE(GlobalBaseReg)
34990 NODE_NAME_CASE(Wrapper)
34991 NODE_NAME_CASE(WrapperRIP)
34992 NODE_NAME_CASE(MOVQ2DQ)
34993 NODE_NAME_CASE(MOVDQ2Q)
34994 NODE_NAME_CASE(MMX_MOVD2W)
34995 NODE_NAME_CASE(MMX_MOVW2D)
34996 NODE_NAME_CASE(PEXTRB)
34997 NODE_NAME_CASE(PEXTRW)
34998 NODE_NAME_CASE(INSERTPS)
34999 NODE_NAME_CASE(PINSRB)
35000 NODE_NAME_CASE(PINSRW)
35001 NODE_NAME_CASE(PSHUFB)
35002 NODE_NAME_CASE(ANDNP)
35003 NODE_NAME_CASE(BLENDI)
35005 NODE_NAME_CASE(HADD)
35006 NODE_NAME_CASE(HSUB)
35007 NODE_NAME_CASE(FHADD)
35008 NODE_NAME_CASE(FHSUB)
35009 NODE_NAME_CASE(CONFLICT)
35010 NODE_NAME_CASE(FMAX)
35011 NODE_NAME_CASE(FMAXS)
35012 NODE_NAME_CASE(FMAX_SAE)
35013 NODE_NAME_CASE(FMAXS_SAE)
35014 NODE_NAME_CASE(STRICT_FMAX)
35015 NODE_NAME_CASE(FMIN)
35016 NODE_NAME_CASE(FMINS)
35017 NODE_NAME_CASE(FMIN_SAE)
35018 NODE_NAME_CASE(FMINS_SAE)
35019 NODE_NAME_CASE(STRICT_FMIN)
35020 NODE_NAME_CASE(FMAXC)
35021 NODE_NAME_CASE(FMINC)
35022 NODE_NAME_CASE(FRSQRT)
35023 NODE_NAME_CASE(FRCP)
35024 NODE_NAME_CASE(EXTRQI)
35025 NODE_NAME_CASE(INSERTQI)
35026 NODE_NAME_CASE(TLSADDR)
35027 NODE_NAME_CASE(TLSBASEADDR)
35028 NODE_NAME_CASE(TLSCALL)
35029 NODE_NAME_CASE(TLSDESC)
35030 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35031 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35032 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35033 NODE_NAME_CASE(EH_RETURN)
35034 NODE_NAME_CASE(TC_RETURN)
35035 NODE_NAME_CASE(FNSTCW16m)
35036 NODE_NAME_CASE(FLDCW16m)
35037 NODE_NAME_CASE(FNSTENVm)
35038 NODE_NAME_CASE(FLDENVm)
35039 NODE_NAME_CASE(LCMPXCHG_DAG)
35040 NODE_NAME_CASE(LCMPXCHG8_DAG)
35041 NODE_NAME_CASE(LCMPXCHG16_DAG)
35042 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35043 NODE_NAME_CASE(LADD)
35044 NODE_NAME_CASE(LSUB)
35045 NODE_NAME_CASE(LOR)
35046 NODE_NAME_CASE(LXOR)
35047 NODE_NAME_CASE(LAND)
35048 NODE_NAME_CASE(LBTS)
35049 NODE_NAME_CASE(LBTC)
35050 NODE_NAME_CASE(LBTR)
35051 NODE_NAME_CASE(LBTS_RM)
35052 NODE_NAME_CASE(LBTC_RM)
35053 NODE_NAME_CASE(LBTR_RM)
35054 NODE_NAME_CASE(AADD)
35055 NODE_NAME_CASE(AOR)
35056 NODE_NAME_CASE(AXOR)
35057 NODE_NAME_CASE(AAND)
35058 NODE_NAME_CASE(VZEXT_MOVL)
35059 NODE_NAME_CASE(VZEXT_LOAD)
35060 NODE_NAME_CASE(VEXTRACT_STORE)
35061 NODE_NAME_CASE(VTRUNC)
35062 NODE_NAME_CASE(VTRUNCS)
35063 NODE_NAME_CASE(VTRUNCUS)
35064 NODE_NAME_CASE(VMTRUNC)
35065 NODE_NAME_CASE(VMTRUNCS)
35066 NODE_NAME_CASE(VMTRUNCUS)
35067 NODE_NAME_CASE(VTRUNCSTORES)
35068 NODE_NAME_CASE(VTRUNCSTOREUS)
35069 NODE_NAME_CASE(VMTRUNCSTORES)
35070 NODE_NAME_CASE(VMTRUNCSTOREUS)
35071 NODE_NAME_CASE(VFPEXT)
35072 NODE_NAME_CASE(STRICT_VFPEXT)
35073 NODE_NAME_CASE(VFPEXT_SAE)
35074 NODE_NAME_CASE(VFPEXTS)
35075 NODE_NAME_CASE(VFPEXTS_SAE)
35076 NODE_NAME_CASE(VFPROUND)
35077 NODE_NAME_CASE(VFPROUND2)
35078 NODE_NAME_CASE(VFPROUND2_RND)
35079 NODE_NAME_CASE(STRICT_VFPROUND)
35080 NODE_NAME_CASE(VMFPROUND)
35081 NODE_NAME_CASE(VFPROUND_RND)
35082 NODE_NAME_CASE(VFPROUNDS)
35083 NODE_NAME_CASE(VFPROUNDS_RND)
35084 NODE_NAME_CASE(VSHLDQ)
35085 NODE_NAME_CASE(VSRLDQ)
35086 NODE_NAME_CASE(VSHL)
35087 NODE_NAME_CASE(VSRL)
35088 NODE_NAME_CASE(VSRA)
35089 NODE_NAME_CASE(VSHLI)
35090 NODE_NAME_CASE(VSRLI)
35091 NODE_NAME_CASE(VSRAI)
35092 NODE_NAME_CASE(VSHLV)
35093 NODE_NAME_CASE(VSRLV)
35094 NODE_NAME_CASE(VSRAV)
35095 NODE_NAME_CASE(VROTLI)
35096 NODE_NAME_CASE(VROTRI)
35097 NODE_NAME_CASE(VPPERM)
35098 NODE_NAME_CASE(CMPP)
35099 NODE_NAME_CASE(STRICT_CMPP)
35100 NODE_NAME_CASE(PCMPEQ)
35101 NODE_NAME_CASE(PCMPGT)
35102 NODE_NAME_CASE(PHMINPOS)
35103 NODE_NAME_CASE(ADD)
35104 NODE_NAME_CASE(SUB)
35105 NODE_NAME_CASE(ADC)
35106 NODE_NAME_CASE(SBB)
35107 NODE_NAME_CASE(SMUL)
35108 NODE_NAME_CASE(UMUL)
35109 NODE_NAME_CASE(OR)
35110 NODE_NAME_CASE(XOR)
35111 NODE_NAME_CASE(AND)
35112 NODE_NAME_CASE(BEXTR)
35114 NODE_NAME_CASE(BZHI)
35115 NODE_NAME_CASE(PDEP)
35116 NODE_NAME_CASE(PEXT)
35117 NODE_NAME_CASE(MUL_IMM)
35118 NODE_NAME_CASE(MOVMSK)
35119 NODE_NAME_CASE(PTEST)
35120 NODE_NAME_CASE(TESTP)
35121 NODE_NAME_CASE(KORTEST)
35122 NODE_NAME_CASE(KTEST)
35123 NODE_NAME_CASE(KADD)
35124 NODE_NAME_CASE(KSHIFTL)
35125 NODE_NAME_CASE(KSHIFTR)
35126 NODE_NAME_CASE(PACKSS)
35127 NODE_NAME_CASE(PACKUS)
35128 NODE_NAME_CASE(PALIGNR)
35129 NODE_NAME_CASE(VALIGN)
35130 NODE_NAME_CASE(VSHLD)
35131 NODE_NAME_CASE(VSHRD)
35132 NODE_NAME_CASE(VSHLDV)
35133 NODE_NAME_CASE(VSHRDV)
35134 NODE_NAME_CASE(PSHUFD)
35135 NODE_NAME_CASE(PSHUFHW)
35136 NODE_NAME_CASE(PSHUFLW)
35137 NODE_NAME_CASE(SHUFP)
35138 NODE_NAME_CASE(SHUF128)
35139 NODE_NAME_CASE(MOVLHPS)
35140 NODE_NAME_CASE(MOVHLPS)
35141 NODE_NAME_CASE(MOVDDUP)
35142 NODE_NAME_CASE(MOVSHDUP)
35143 NODE_NAME_CASE(MOVSLDUP)
35144 NODE_NAME_CASE(MOVSD)
35145 NODE_NAME_CASE(MOVSS)
35146 NODE_NAME_CASE(MOVSH)
35147 NODE_NAME_CASE(UNPCKL)
35148 NODE_NAME_CASE(UNPCKH)
35149 NODE_NAME_CASE(VBROADCAST)
35150 NODE_NAME_CASE(VBROADCAST_LOAD)
35151 NODE_NAME_CASE(VBROADCASTM)
35152 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35153 NODE_NAME_CASE(VPERMILPV)
35154 NODE_NAME_CASE(VPERMILPI)
35155 NODE_NAME_CASE(VPERM2X128)
35156 NODE_NAME_CASE(VPERMV)
35157 NODE_NAME_CASE(VPERMV3)
35158 NODE_NAME_CASE(VPERMI)
35159 NODE_NAME_CASE(VPTERNLOG)
35160 NODE_NAME_CASE(FP_TO_SINT_SAT)
35161 NODE_NAME_CASE(FP_TO_UINT_SAT)
35162 NODE_NAME_CASE(VFIXUPIMM)
35163 NODE_NAME_CASE(VFIXUPIMM_SAE)
35164 NODE_NAME_CASE(VFIXUPIMMS)
35165 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35166 NODE_NAME_CASE(VRANGE)
35167 NODE_NAME_CASE(VRANGE_SAE)
35168 NODE_NAME_CASE(VRANGES)
35169 NODE_NAME_CASE(VRANGES_SAE)
35170 NODE_NAME_CASE(PMULUDQ)
35171 NODE_NAME_CASE(PMULDQ)
35172 NODE_NAME_CASE(PSADBW)
35173 NODE_NAME_CASE(DBPSADBW)
35174 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35175 NODE_NAME_CASE(VAARG_64)
35176 NODE_NAME_CASE(VAARG_X32)
35177 NODE_NAME_CASE(DYN_ALLOCA)
35178 NODE_NAME_CASE(MFENCE)
35179 NODE_NAME_CASE(SEG_ALLOCA)
35180 NODE_NAME_CASE(PROBED_ALLOCA)
35183 NODE_NAME_CASE(RDPKRU)
35184 NODE_NAME_CASE(WRPKRU)
35185 NODE_NAME_CASE(VPMADDUBSW)
35186 NODE_NAME_CASE(VPMADDWD)
35187 NODE_NAME_CASE(VPSHA)
35188 NODE_NAME_CASE(VPSHL)
35189 NODE_NAME_CASE(VPCOM)
35190 NODE_NAME_CASE(VPCOMU)
35191 NODE_NAME_CASE(VPERMIL2)
35192 NODE_NAME_CASE(FMSUB)
35193 NODE_NAME_CASE(STRICT_FMSUB)
35194 NODE_NAME_CASE(FNMADD)
35195 NODE_NAME_CASE(STRICT_FNMADD)
35196 NODE_NAME_CASE(FNMSUB)
35197 NODE_NAME_CASE(STRICT_FNMSUB)
35198 NODE_NAME_CASE(FMADDSUB)
35199 NODE_NAME_CASE(FMSUBADD)
35200 NODE_NAME_CASE(FMADD_RND)
35201 NODE_NAME_CASE(FNMADD_RND)
35202 NODE_NAME_CASE(FMSUB_RND)
35203 NODE_NAME_CASE(FNMSUB_RND)
35204 NODE_NAME_CASE(FMADDSUB_RND)
35205 NODE_NAME_CASE(FMSUBADD_RND)
35206 NODE_NAME_CASE(VFMADDC)
35207 NODE_NAME_CASE(VFMADDC_RND)
35208 NODE_NAME_CASE(VFCMADDC)
35209 NODE_NAME_CASE(VFCMADDC_RND)
35210 NODE_NAME_CASE(VFMULC)
35211 NODE_NAME_CASE(VFMULC_RND)
35212 NODE_NAME_CASE(VFCMULC)
35213 NODE_NAME_CASE(VFCMULC_RND)
35214 NODE_NAME_CASE(VFMULCSH)
35215 NODE_NAME_CASE(VFMULCSH_RND)
35216 NODE_NAME_CASE(VFCMULCSH)
35217 NODE_NAME_CASE(VFCMULCSH_RND)
35218 NODE_NAME_CASE(VFMADDCSH)
35219 NODE_NAME_CASE(VFMADDCSH_RND)
35220 NODE_NAME_CASE(VFCMADDCSH)
35221 NODE_NAME_CASE(VFCMADDCSH_RND)
35222 NODE_NAME_CASE(VPMADD52H)
35223 NODE_NAME_CASE(VPMADD52L)
35224 NODE_NAME_CASE(VRNDSCALE)
35225 NODE_NAME_CASE(STRICT_VRNDSCALE)
35226 NODE_NAME_CASE(VRNDSCALE_SAE)
35227 NODE_NAME_CASE(VRNDSCALES)
35228 NODE_NAME_CASE(VRNDSCALES_SAE)
35229 NODE_NAME_CASE(VREDUCE)
35230 NODE_NAME_CASE(VREDUCE_SAE)
35231 NODE_NAME_CASE(VREDUCES)
35232 NODE_NAME_CASE(VREDUCES_SAE)
35233 NODE_NAME_CASE(VGETMANT)
35234 NODE_NAME_CASE(VGETMANT_SAE)
35235 NODE_NAME_CASE(VGETMANTS)
35236 NODE_NAME_CASE(VGETMANTS_SAE)
35237 NODE_NAME_CASE(PCMPESTR)
35238 NODE_NAME_CASE(PCMPISTR)
35240 NODE_NAME_CASE(COMPRESS)
35241 NODE_NAME_CASE(EXPAND)
35242 NODE_NAME_CASE(SELECTS)
35243 NODE_NAME_CASE(ADDSUB)
35244 NODE_NAME_CASE(RCP14)
35245 NODE_NAME_CASE(RCP14S)
35246 NODE_NAME_CASE(RSQRT14)
35247 NODE_NAME_CASE(RSQRT14S)
35248 NODE_NAME_CASE(FADD_RND)
35249 NODE_NAME_CASE(FADDS)
35250 NODE_NAME_CASE(FADDS_RND)
35251 NODE_NAME_CASE(FSUB_RND)
35252 NODE_NAME_CASE(FSUBS)
35253 NODE_NAME_CASE(FSUBS_RND)
35254 NODE_NAME_CASE(FMUL_RND)
35255 NODE_NAME_CASE(FMULS)
35256 NODE_NAME_CASE(FMULS_RND)
35257 NODE_NAME_CASE(FDIV_RND)
35258 NODE_NAME_CASE(FDIVS)
35259 NODE_NAME_CASE(FDIVS_RND)
35260 NODE_NAME_CASE(FSQRT_RND)
35261 NODE_NAME_CASE(FSQRTS)
35262 NODE_NAME_CASE(FSQRTS_RND)
35263 NODE_NAME_CASE(FGETEXP)
35264 NODE_NAME_CASE(FGETEXP_SAE)
35265 NODE_NAME_CASE(FGETEXPS)
35266 NODE_NAME_CASE(FGETEXPS_SAE)
35267 NODE_NAME_CASE(SCALEF)
35268 NODE_NAME_CASE(SCALEF_RND)
35269 NODE_NAME_CASE(SCALEFS)
35270 NODE_NAME_CASE(SCALEFS_RND)
35271 NODE_NAME_CASE(MULHRS)
35272 NODE_NAME_CASE(SINT_TO_FP_RND)
35273 NODE_NAME_CASE(UINT_TO_FP_RND)
35274 NODE_NAME_CASE(CVTTP2SI)
35275 NODE_NAME_CASE(CVTTP2UI)
35276 NODE_NAME_CASE(STRICT_CVTTP2SI)
35277 NODE_NAME_CASE(STRICT_CVTTP2UI)
35278 NODE_NAME_CASE(MCVTTP2SI)
35279 NODE_NAME_CASE(MCVTTP2UI)
35280 NODE_NAME_CASE(CVTTP2SI_SAE)
35281 NODE_NAME_CASE(CVTTP2UI_SAE)
35282 NODE_NAME_CASE(CVTTS2SI)
35283 NODE_NAME_CASE(CVTTS2UI)
35284 NODE_NAME_CASE(CVTTS2SI_SAE)
35285 NODE_NAME_CASE(CVTTS2UI_SAE)
35286 NODE_NAME_CASE(CVTSI2P)
35287 NODE_NAME_CASE(CVTUI2P)
35288 NODE_NAME_CASE(STRICT_CVTSI2P)
35289 NODE_NAME_CASE(STRICT_CVTUI2P)
35290 NODE_NAME_CASE(MCVTSI2P)
35291 NODE_NAME_CASE(MCVTUI2P)
35292 NODE_NAME_CASE(VFPCLASS)
35293 NODE_NAME_CASE(VFPCLASSS)
35294 NODE_NAME_CASE(MULTISHIFT)
35295 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35296 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35297 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35298 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35299 NODE_NAME_CASE(CVTPS2PH)
35300 NODE_NAME_CASE(STRICT_CVTPS2PH)
35301 NODE_NAME_CASE(CVTPS2PH_SAE)
35302 NODE_NAME_CASE(MCVTPS2PH)
35303 NODE_NAME_CASE(MCVTPS2PH_SAE)
35304 NODE_NAME_CASE(CVTPH2PS)
35305 NODE_NAME_CASE(STRICT_CVTPH2PS)
35306 NODE_NAME_CASE(CVTPH2PS_SAE)
35307 NODE_NAME_CASE(CVTP2SI)
35308 NODE_NAME_CASE(CVTP2UI)
35309 NODE_NAME_CASE(MCVTP2SI)
35310 NODE_NAME_CASE(MCVTP2UI)
35311 NODE_NAME_CASE(CVTP2SI_RND)
35312 NODE_NAME_CASE(CVTP2UI_RND)
35313 NODE_NAME_CASE(CVTS2SI)
35314 NODE_NAME_CASE(CVTS2UI)
35315 NODE_NAME_CASE(CVTS2SI_RND)
35316 NODE_NAME_CASE(CVTS2UI_RND)
35317 NODE_NAME_CASE(CVTNEPS2BF16)
35318 NODE_NAME_CASE(MCVTNEPS2BF16)
35319 NODE_NAME_CASE(DPBF16PS)
35320 NODE_NAME_CASE(DPFP16PS)
35321 NODE_NAME_CASE(MPSADBW)
35322 NODE_NAME_CASE(LWPINS)
35323 NODE_NAME_CASE(MGATHER)
35324 NODE_NAME_CASE(MSCATTER)
35325 NODE_NAME_CASE(VPDPBUSD)
35326 NODE_NAME_CASE(VPDPBUSDS)
35327 NODE_NAME_CASE(VPDPWSSD)
35328 NODE_NAME_CASE(VPDPWSSDS)
35329 NODE_NAME_CASE(VPSHUFBITQMB)
35330 NODE_NAME_CASE(GF2P8MULB)
35331 NODE_NAME_CASE(GF2P8AFFINEQB)
35332 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35333 NODE_NAME_CASE(NT_CALL)
35334 NODE_NAME_CASE(NT_BRIND)
35335 NODE_NAME_CASE(UMWAIT)
35336 NODE_NAME_CASE(TPAUSE)
35337 NODE_NAME_CASE(ENQCMD)
35338 NODE_NAME_CASE(ENQCMDS)
35339 NODE_NAME_CASE(VP2INTERSECT)
35340 NODE_NAME_CASE(VPDPBSUD)
35341 NODE_NAME_CASE(VPDPBSUDS)
35342 NODE_NAME_CASE(VPDPBUUD)
35343 NODE_NAME_CASE(VPDPBUUDS)
35344 NODE_NAME_CASE(VPDPBSSD)
35345 NODE_NAME_CASE(VPDPBSSDS)
35346 NODE_NAME_CASE(VPDPWSUD)
35347 NODE_NAME_CASE(VPDPWSUDS)
35348 NODE_NAME_CASE(VPDPWUSD)
35349 NODE_NAME_CASE(VPDPWUSDS)
35350 NODE_NAME_CASE(VPDPWUUD)
35351 NODE_NAME_CASE(VPDPWUUDS)
35352 NODE_NAME_CASE(VMINMAX)
35353 NODE_NAME_CASE(VMINMAX_SAE)
35354 NODE_NAME_CASE(VMINMAXS)
35355 NODE_NAME_CASE(VMINMAXS_SAE)
35356 NODE_NAME_CASE(CVTP2IBS)
35357 NODE_NAME_CASE(CVTP2IUBS)
35358 NODE_NAME_CASE(CVTP2IBS_RND)
35359 NODE_NAME_CASE(CVTP2IUBS_RND)
35360 NODE_NAME_CASE(CVTTP2IBS)
35361 NODE_NAME_CASE(CVTTP2IUBS)
35362 NODE_NAME_CASE(CVTTP2IBS_SAE)
35363 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35364 NODE_NAME_CASE(VCVT2PH2BF8)
35365 NODE_NAME_CASE(VCVT2PH2BF8S)
35366 NODE_NAME_CASE(VCVT2PH2HF8)
35367 NODE_NAME_CASE(VCVT2PH2HF8S)
35368 NODE_NAME_CASE(VCVTBIASPH2BF8)
35369 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35370 NODE_NAME_CASE(VCVTBIASPH2HF8)
35371 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35372 NODE_NAME_CASE(VCVTPH2BF8)
35373 NODE_NAME_CASE(VCVTPH2BF8S)
35374 NODE_NAME_CASE(VCVTPH2HF8)
35375 NODE_NAME_CASE(VCVTPH2HF8S)
35376 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35377 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35378 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35379 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35380 NODE_NAME_CASE(VMCVTPH2BF8)
35381 NODE_NAME_CASE(VMCVTPH2BF8S)
35382 NODE_NAME_CASE(VMCVTPH2HF8)
35383 NODE_NAME_CASE(VMCVTPH2HF8S)
35384 NODE_NAME_CASE(VCVTHF82PH)
35385 NODE_NAME_CASE(AESENC128KL)
35386 NODE_NAME_CASE(AESDEC128KL)
35387 NODE_NAME_CASE(AESENC256KL)
35388 NODE_NAME_CASE(AESDEC256KL)
35389 NODE_NAME_CASE(AESENCWIDE128KL)
35390 NODE_NAME_CASE(AESDECWIDE128KL)
35391 NODE_NAME_CASE(AESENCWIDE256KL)
35392 NODE_NAME_CASE(AESDECWIDE256KL)
35393 NODE_NAME_CASE(CMPCCXADD)
35394 NODE_NAME_CASE(TESTUI)
35395 NODE_NAME_CASE(FP80_ADD)
35396 NODE_NAME_CASE(STRICT_FP80_ADD)
35397 NODE_NAME_CASE(CCMP)
35398 NODE_NAME_CASE(CTEST)
35399 NODE_NAME_CASE(CLOAD)
35400 NODE_NAME_CASE(CSTORE)
35401 NODE_NAME_CASE(CVTTS2SIS)
35402 NODE_NAME_CASE(CVTTS2UIS)
35403 NODE_NAME_CASE(CVTTS2SIS_SAE)
35404 NODE_NAME_CASE(CVTTS2UIS_SAE)
35405 NODE_NAME_CASE(CVTTP2SIS)
35406 NODE_NAME_CASE(MCVTTP2SIS)
35407 NODE_NAME_CASE(CVTTP2UIS_SAE)
35408 NODE_NAME_CASE(CVTTP2SIS_SAE)
35409 NODE_NAME_CASE(CVTTP2UIS)
35410 NODE_NAME_CASE(MCVTTP2UIS)
35411 NODE_NAME_CASE(POP_FROM_X87_REG)
35412 }
35413 return nullptr;
35414#undef NODE_NAME_CASE
35415}
35416
35417/// Return true if the addressing mode represented by AM is legal for this
35418/// target, for a load/store of the specified type.
35419bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35420 const AddrMode &AM, Type *Ty,
35421 unsigned AS,
35422 Instruction *I) const {
35423 // X86 supports extremely general addressing modes.
35424 CodeModel::Model M = getTargetMachine().getCodeModel();
35425
35426 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35427 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35428 return false;
35429
35430 if (AM.BaseGV) {
35431 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35432
35433 // If a reference to this global requires an extra load, we can't fold it.
35434 if (isGlobalStubReference(GVFlags))
35435 return false;
35436
35437 // If BaseGV requires a register for the PIC base, we cannot also have a
35438 // BaseReg specified.
35439 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35440 return false;
35441
35442 // If lower 4G is not available, then we must use rip-relative addressing.
35443 if ((M != CodeModel::Small || isPositionIndependent()) &&
35444 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35445 return false;
35446 }
35447
35448 switch (AM.Scale) {
35449 case 0:
35450 case 1:
35451 case 2:
35452 case 4:
35453 case 8:
35454 // These scales always work.
35455 break;
35456 case 3:
35457 case 5:
35458 case 9:
35459 // These scales are formed with basereg+scalereg. Only accept if there is
35460 // no basereg yet.
35461 if (AM.HasBaseReg)
35462 return false;
35463 break;
35464 default: // Other stuff never works.
35465 return false;
35466 }
35467
35468 return true;
35469}
35470
35471bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35472 switch (Opcode) {
35473 // These are non-commutative binops.
35474 // TODO: Add more X86ISD opcodes once we have test coverage.
35475 case X86ISD::ANDNP:
35476 case X86ISD::PCMPGT:
35477 case X86ISD::FMAX:
35478 case X86ISD::FMIN:
35479 case X86ISD::FANDN:
35480 case X86ISD::VPSHA:
35481 case X86ISD::VPSHL:
35482 case X86ISD::VSHLV:
35483 case X86ISD::VSRLV:
35484 case X86ISD::VSRAV:
35485 return true;
35486 }
35487
35488 return TargetLoweringBase::isBinOp(Opcode);
35489}
35490
35491bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35492 switch (Opcode) {
35493 // TODO: Add more X86ISD opcodes once we have test coverage.
35494 case X86ISD::PCMPEQ:
35495 case X86ISD::PMULDQ:
35496 case X86ISD::PMULUDQ:
35497 case X86ISD::FMAXC:
35498 case X86ISD::FMINC:
35499 case X86ISD::FAND:
35500 case X86ISD::FOR:
35501 case X86ISD::FXOR:
35502 return true;
35503 }
35504
35505 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35506}
35507
35508bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35509 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35510 return false;
35511 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35512 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35513 return NumBits1 > NumBits2;
35514}
35515
35516bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35517 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35518 return false;
35519
35520 if (!isTypeLegal(EVT::getEVT(Ty1)))
35521 return false;
35522
35523 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35524
35525 // Assuming the caller doesn't have a zeroext or signext return parameter,
35526 // truncation all the way down to i1 is valid.
35527 return true;
35528}
35529
35530bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35531 return isInt<32>(Imm);
35532}
35533
35534bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35535 // Can also use sub to handle negated immediates.
35536 return isInt<32>(Imm);
35537}
35538
35539bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35540 return isInt<32>(Imm);
35541}
35542
35543bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35544 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35545 return false;
35546 unsigned NumBits1 = VT1.getSizeInBits();
35547 unsigned NumBits2 = VT2.getSizeInBits();
35548 return NumBits1 > NumBits2;
35549}
35550
35551bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35552 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35553 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35554}
35555
35556bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35557 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35558 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35559}
35560
35561bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35562 EVT VT1 = Val.getValueType();
35563 if (isZExtFree(VT1, VT2))
35564 return true;
35565
35566 if (Val.getOpcode() != ISD::LOAD)
35567 return false;
35568
35569 if (!VT1.isSimple() || !VT1.isInteger() ||
35570 !VT2.isSimple() || !VT2.isInteger())
35571 return false;
35572
35573 switch (VT1.getSimpleVT().SimpleTy) {
35574 default: break;
35575 case MVT::i8:
35576 case MVT::i16:
35577 case MVT::i32:
35578 // X86 has 8, 16, and 32-bit zero-extending loads.
35579 return true;
35580 }
35581
35582 return false;
35583}
35584
35585bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35586 if (!Subtarget.is64Bit())
35587 return false;
35588 return TargetLowering::shouldConvertPhiType(From, To);
35589}
35590
35591bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35592 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35593 return false;
35594
35595 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35596
35597 // There is no extending load for vXi1.
35598 if (SrcVT.getScalarType() == MVT::i1)
35599 return false;
35600
35601 return true;
35602}
35603
35604bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35605 EVT VT) const {
35606 if (Subtarget.useSoftFloat())
35607 return false;
35608
35609 if (!Subtarget.hasAnyFMA())
35610 return false;
35611
35612 VT = VT.getScalarType();
35613
35614 if (!VT.isSimple())
35615 return false;
35616
35617 switch (VT.getSimpleVT().SimpleTy) {
35618 case MVT::f16:
35619 return Subtarget.hasFP16();
35620 case MVT::f32:
35621 case MVT::f64:
35622 return true;
35623 default:
35624 break;
35625 }
35626
35627 return false;
35628}
35629
35630bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35631 EVT DestVT) const {
35632 // i16 instructions are longer (0x66 prefix) and potentially slower.
35633 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35634}
35635
35636bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35637 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35638 SDValue Y) const {
35639 if (SelectOpcode == ISD::SELECT) {
35640 if (VT.isVector())
35641 return false;
35642 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35643 return false;
35644 using namespace llvm::SDPatternMatch;
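// With BMI, x & -x, x & (x - 1) and x ^ (x - 1) correspond to the BLSI,
// BLSR and BLSMSK instructions; the matches below recognize those forms.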
35645 // BLSI
35646 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35647 sd_match(X, m_Neg(m_Specific(Y)))))
35648 return true;
35649 // BLSR
35650 if (BinOpcode == ISD::AND &&
35651 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35652 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35653 return true;
35654 // BLSMSK
35655 if (BinOpcode == ISD::XOR &&
35656 (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35657 sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35658 return true;
35659
35660 return false;
35661 }
35662 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35663 // benefit. The transform may also be profitable for scalar code.
35664 if (!Subtarget.hasAVX512())
35665 return false;
35666 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35667 return false;
35668 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35669 return false;
35670
35671 return true;
35672}
35673
35674/// Targets can use this to indicate that they only support *some*
35675/// VECTOR_SHUFFLE operations, those with specific masks.
35676/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35677/// are assumed to be legal.
35678bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35679 if (!VT.isSimple())
35680 return false;
35681
35682 // Not for i1 vectors
35683 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35684 return false;
35685
35686 // Very little shuffling can be done for 64-bit vectors right now.
35687 if (VT.getSimpleVT().getSizeInBits() == 64)
35688 return false;
35689
35690 // We only care that the types being shuffled are legal. The lowering can
35691 // handle any possible shuffle mask that results.
35692 return isTypeLegal(VT.getSimpleVT());
35693}
35694
35695bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35696 EVT VT) const {
35697 // Don't convert an 'and' into a shuffle that we don't directly support.
35698 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35699 if (!Subtarget.hasAVX2())
35700 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35701 return false;
35702
35703 // Just delegate to the generic legality, clear masks aren't special.
35704 return isShuffleMaskLegal(Mask, VT);
35705}
35706
35707bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35708 // If the subtarget is using thunks, we need to not generate jump tables.
35709 if (Subtarget.useIndirectThunkBranches())
35710 return false;
35711
35712 // Otherwise, fallback on the generic logic.
35713 return TargetLowering::areJTsAllowed(Fn);
35714}
35715
35716MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35717 EVT ConditionVT) const {
35718 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35719 // zero-extensions.
35720 if (ConditionVT.getSizeInBits() < 32)
35721 return MVT::i32;
35722 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35723 ConditionVT);
35724}
35725
35726//===----------------------------------------------------------------------===//
35727// X86 Scheduler Hooks
35728//===----------------------------------------------------------------------===//
35729
35730/// Utility function to emit xbegin specifying the start of an RTM region.
35731static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35732 const TargetInstrInfo *TII) {
35733 const MIMetadata MIMD(MI);
35734
35735 const BasicBlock *BB = MBB->getBasicBlock();
35736 MachineFunction::iterator I = ++MBB->getIterator();
35737
35738 // For the v = xbegin(), we generate
35739 //
35740 // thisMBB:
35741 // xbegin fallMBB
35742 //
35743 // mainMBB:
35744 // s0 = -1
35745 //
35746 // fallBB:
35747 // eax = # XABORT_DEF
35748 // s1 = eax
35749 //
35750 // sinkMBB:
35751 // v = phi(s0/mainBB, s1/fallBB)
35752
35753 MachineBasicBlock *thisMBB = MBB;
35754 MachineFunction *MF = MBB->getParent();
35755 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35756 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35757 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35758 MF->insert(I, mainMBB);
35759 MF->insert(I, fallMBB);
35760 MF->insert(I, sinkMBB);
35761
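// If EFLAGS is still used after this instruction, keep it live into the new
// blocks so its value is preserved.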
35762 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35763 mainMBB->addLiveIn(X86::EFLAGS);
35764 fallMBB->addLiveIn(X86::EFLAGS);
35765 sinkMBB->addLiveIn(X86::EFLAGS);
35766 }
35767
35768 // Transfer the remainder of BB and its successor edges to sinkMBB.
35769 sinkMBB->splice(sinkMBB->begin(), MBB,
35770 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35771 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35772
35773 MachineRegisterInfo &MRI = MF->getRegInfo();
35774 Register DstReg = MI.getOperand(0).getReg();
35775 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35776 Register mainDstReg = MRI.createVirtualRegister(RC);
35777 Register fallDstReg = MRI.createVirtualRegister(RC);
35778
35779 // thisMBB:
35780 // xbegin fallMBB
35781 // # fallthrough to mainMBB
35782 // # abort path to fallMBB
35783 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35784 thisMBB->addSuccessor(mainMBB);
35785 thisMBB->addSuccessor(fallMBB);
35786
35787 // mainMBB:
35788 // mainDstReg := -1
35789 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35790 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35791 mainMBB->addSuccessor(sinkMBB);
35792
35793 // fallMBB:
35794 // ; pseudo instruction to model hardware's definition from XABORT
35795 // EAX := XABORT_DEF
35796 // fallDstReg := EAX
35797 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35798 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35799 .addReg(X86::EAX);
35800 fallMBB->addSuccessor(sinkMBB);
35801
35802 // sinkMBB:
35803 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35804 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35805 .addReg(mainDstReg).addMBB(mainMBB)
35806 .addReg(fallDstReg).addMBB(fallMBB);
35807
35808 MI.eraseFromParent();
35809 return sinkMBB;
35810}
35811
35812MachineBasicBlock *
35813X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35814 MachineBasicBlock *MBB) const {
35815 // Emit va_arg instruction on X86-64.
35816
35817 // Operands to this pseudo-instruction:
35818 // 0 ) Output : destination address (reg)
35819 // 1-5) Input : va_list address (addr, i64mem)
35820 // 6 ) ArgSize : Size (in bytes) of vararg type
35821 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35822 // 8 ) Align : Alignment of type
35823 // 9 ) EFLAGS (implicit-def)
35824
35825 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35826 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35827
35828 Register DestReg = MI.getOperand(0).getReg();
35829 MachineOperand &Base = MI.getOperand(1);
35830 MachineOperand &Scale = MI.getOperand(2);
35831 MachineOperand &Index = MI.getOperand(3);
35832 MachineOperand &Disp = MI.getOperand(4);
35833 MachineOperand &Segment = MI.getOperand(5);
35834 unsigned ArgSize = MI.getOperand(6).getImm();
35835 unsigned ArgMode = MI.getOperand(7).getImm();
35836 Align Alignment = Align(MI.getOperand(8).getImm());
35837
35838 MachineFunction *MF = MBB->getParent();
35839
35840 // Memory Reference
35841 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35842
35843 MachineMemOperand *OldMMO = MI.memoperands().front();
35844
35845 // Clone the MMO into two separate MMOs for loading and storing
35846 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35847 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35848 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35849 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35850
35851 // Machine Information
35852 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35853 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35854 const TargetRegisterClass *AddrRegClass =
35855 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35856 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35857 const MIMetadata MIMD(MI);
35858
35859 // struct va_list {
35860 // i32 gp_offset
35861 // i32 fp_offset
35862 // i64 overflow_area (address)
35863 // i64 reg_save_area (address)
35864 // }
35865 // sizeof(va_list) = 24
35866 // alignment(va_list) = 8
35867
35868 unsigned TotalNumIntRegs = 6;
35869 unsigned TotalNumXMMRegs = 8;
35870 bool UseGPOffset = (ArgMode == 1);
35871 bool UseFPOffset = (ArgMode == 2);
35872 unsigned MaxOffset = TotalNumIntRegs * 8 +
35873 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35874
35875 // Align ArgSize to a multiple of 8.
35876 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35877 bool NeedsAlign = (Alignment > 8);
35878
35879 MachineBasicBlock *thisMBB = MBB;
35880 MachineBasicBlock *overflowMBB;
35881 MachineBasicBlock *offsetMBB;
35882 MachineBasicBlock *endMBB;
35883
35884 Register OffsetDestReg; // Argument address computed by offsetMBB
35885 Register OverflowDestReg; // Argument address computed by overflowMBB
35886 Register OffsetReg;
35887
35888 if (!UseGPOffset && !UseFPOffset) {
35889 // If we only pull from the overflow region, we don't create a branch.
35890 // We don't need to alter control flow.
35891 OffsetDestReg = Register(); // unused
35892 OverflowDestReg = DestReg;
35893
35894 offsetMBB = nullptr;
35895 overflowMBB = thisMBB;
35896 endMBB = thisMBB;
35897 } else {
35898 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35899 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35900 // If not, pull from overflow_area. (branch to overflowMBB)
35901 //
35902 // thisMBB
35903 // | .
35904 // | .
35905 // offsetMBB overflowMBB
35906 // | .
35907 // | .
35908 // endMBB
35909
35910 // Registers for the PHI in endMBB
35911 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35912 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35913
35914 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35915 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35916 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35917 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35918
35919 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35920
35921 // Insert the new basic blocks
35922 MF->insert(MBBIter, offsetMBB);
35923 MF->insert(MBBIter, overflowMBB);
35924 MF->insert(MBBIter, endMBB);
35925
35926 // Transfer the remainder of MBB and its successor edges to endMBB.
35927 endMBB->splice(endMBB->begin(), thisMBB,
35928 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35929 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35930
35931 // Make offsetMBB and overflowMBB successors of thisMBB
35932 thisMBB->addSuccessor(offsetMBB);
35933 thisMBB->addSuccessor(overflowMBB);
35934
35935 // endMBB is a successor of both offsetMBB and overflowMBB
35936 offsetMBB->addSuccessor(endMBB);
35937 overflowMBB->addSuccessor(endMBB);
35938
35939 // Load the offset value into a register
35940 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35941 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35942 .add(Base)
35943 .add(Scale)
35944 .add(Index)
35945 .addDisp(Disp, UseFPOffset ? 4 : 0)
35946 .add(Segment)
35947 .setMemRefs(LoadOnlyMMO);
35948
35949 // Check if there is enough room left to pull this argument.
35950 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35951 .addReg(OffsetReg)
35952 .addImm(MaxOffset + 8 - ArgSizeA8);
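    // The argument still fits in the register save area iff
    // OffsetReg + ArgSizeA8 <= MaxOffset. Since every quantity involved is a
    // multiple of 8, that is equivalent to OffsetReg < MaxOffset + 8 -
    // ArgSizeA8, which is exactly the bound used in the CMP above; the JAE
    // below takes the overflow path when it does not hold.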
35953
35954 // Branch to "overflowMBB" if offset >= max
35955 // Fall through to "offsetMBB" otherwise
35956 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35957 .addMBB(overflowMBB).addImm(X86::COND_AE);
35958 }
35959
35960 // In offsetMBB, emit code to use the reg_save_area.
35961 if (offsetMBB) {
35962 assert(OffsetReg != 0);
35963
35964 // Read the reg_save_area address.
35965 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35966 BuildMI(
35967 offsetMBB, MIMD,
35968 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35969 RegSaveReg)
35970 .add(Base)
35971 .add(Scale)
35972 .add(Index)
35973 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35974 .add(Segment)
35975 .setMemRefs(LoadOnlyMMO);
35976
35977 if (Subtarget.isTarget64BitLP64()) {
35978 // Zero-extend the offset
35979 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35980 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35981 .addImm(0)
35982 .addReg(OffsetReg)
35983 .addImm(X86::sub_32bit);
35984
35985 // Add the offset to the reg_save_area to get the final address.
35986 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35987 .addReg(OffsetReg64)
35988 .addReg(RegSaveReg);
35989 } else {
35990 // Add the offset to the reg_save_area to get the final address.
35991 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35992 .addReg(OffsetReg)
35993 .addReg(RegSaveReg);
35994 }
35995
35996 // Compute the offset for the next argument
35997 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35998 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35999 .addReg(OffsetReg)
36000 .addImm(UseFPOffset ? 16 : 8);
36001
36002 // Store it back into the va_list.
36003 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
36004 .add(Base)
36005 .add(Scale)
36006 .add(Index)
36007 .addDisp(Disp, UseFPOffset ? 4 : 0)
36008 .add(Segment)
36009 .addReg(NextOffsetReg)
36010 .setMemRefs(StoreOnlyMMO);
36011
36012 // Jump to endMBB
36013 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
36014 .addMBB(endMBB);
36015 }
36016
36017 //
36018 // Emit code to use overflow area
36019 //
36020
36021 // Load the overflow_area address into a register.
36022 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36023 BuildMI(overflowMBB, MIMD,
36024 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36025 OverflowAddrReg)
36026 .add(Base)
36027 .add(Scale)
36028 .add(Index)
36029 .addDisp(Disp, 8)
36030 .add(Segment)
36031 .setMemRefs(LoadOnlyMMO);
36032
36033 // If we need to align it, do so. Otherwise, just copy the address
36034 // to OverflowDestReg.
36035 if (NeedsAlign) {
36036 // Align the overflow address
36037 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36038
36039 // aligned_addr = (addr + (align-1)) & ~(align-1)
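    //   e.g. Alignment = 16, addr = 0x1008: (0x1008 + 15) & ~15 = 0x1010
    //   (illustrative values)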
36040 BuildMI(
36041 overflowMBB, MIMD,
36042 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36043 TmpReg)
36044 .addReg(OverflowAddrReg)
36045 .addImm(Alignment.value() - 1);
36046
36047 BuildMI(
36048 overflowMBB, MIMD,
36049 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36050 OverflowDestReg)
36051 .addReg(TmpReg)
36052 .addImm(~(uint64_t)(Alignment.value() - 1));
36053 } else {
36054 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36055 .addReg(OverflowAddrReg);
36056 }
36057
36058 // Compute the next overflow address after this argument.
36059 // (the overflow address should be kept 8-byte aligned)
36060 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36061 BuildMI(
36062 overflowMBB, MIMD,
36063 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36064 NextAddrReg)
36065 .addReg(OverflowDestReg)
36066 .addImm(ArgSizeA8);
36067
36068 // Store the new overflow address.
36069 BuildMI(overflowMBB, MIMD,
36070 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36071 .add(Base)
36072 .add(Scale)
36073 .add(Index)
36074 .addDisp(Disp, 8)
36075 .add(Segment)
36076 .addReg(NextAddrReg)
36077 .setMemRefs(StoreOnlyMMO);
36078
36079 // If we branched, emit the PHI to the front of endMBB.
36080 if (offsetMBB) {
36081 BuildMI(*endMBB, endMBB->begin(), MIMD,
36082 TII->get(X86::PHI), DestReg)
36083 .addReg(OffsetDestReg).addMBB(offsetMBB)
36084 .addReg(OverflowDestReg).addMBB(overflowMBB);
36085 }
36086
36087 // Erase the pseudo instruction
36088 MI.eraseFromParent();
36089
36090 return endMBB;
36091}
36092
36093// The EFLAGS operand of SelectItr might be missing a kill marker
36094// because there were multiple uses of EFLAGS, and ISel didn't know
36095// which to mark. Figure out whether SelectItr should have had a
36096// kill marker, and set it if it should. Returns the correct kill
36097// marker value.
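// Both callers in this file use the result to decide whether EFLAGS has to be
// added as a live-in of the newly created blocks: if the CMOV sequence does
// not kill EFLAGS, the flags are still live into the sink/copy blocks.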
36098static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36099 MachineBasicBlock* BB,
36100 const TargetRegisterInfo* TRI) {
36101 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36102 return false;
36103
36104 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36105 // out. SelectMI should have a kill flag on EFLAGS.
36106 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36107 return true;
36108}
36109
36110// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36111// together with other CMOV pseudo-opcodes into a single basic-block with
36112// conditional jump around it.
36113static bool isCMOVPseudo(MachineInstr &MI) {
36114 switch (MI.getOpcode()) {
36115 case X86::CMOV_FR16:
36116 case X86::CMOV_FR16X:
36117 case X86::CMOV_FR32:
36118 case X86::CMOV_FR32X:
36119 case X86::CMOV_FR64:
36120 case X86::CMOV_FR64X:
36121 case X86::CMOV_GR8:
36122 case X86::CMOV_GR16:
36123 case X86::CMOV_GR32:
36124 case X86::CMOV_RFP32:
36125 case X86::CMOV_RFP64:
36126 case X86::CMOV_RFP80:
36127 case X86::CMOV_VR64:
36128 case X86::CMOV_VR128:
36129 case X86::CMOV_VR128X:
36130 case X86::CMOV_VR256:
36131 case X86::CMOV_VR256X:
36132 case X86::CMOV_VR512:
36133 case X86::CMOV_VK1:
36134 case X86::CMOV_VK2:
36135 case X86::CMOV_VK4:
36136 case X86::CMOV_VK8:
36137 case X86::CMOV_VK16:
36138 case X86::CMOV_VK32:
36139 case X86::CMOV_VK64:
36140 return true;
36141
36142 default:
36143 return false;
36144 }
36145}
36146
36147// Helper function, which inserts PHI functions into SinkMBB:
36148// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36149// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
36150// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36151// the last PHI function inserted.
36152static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36153 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36154 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36155 MachineBasicBlock *SinkMBB) {
36156 MachineFunction *MF = TrueMBB->getParent();
36157 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36158 const MIMetadata MIMD(*MIItBegin);
36159
36160 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36161 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36162
36163 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36164
36165 // As we are creating the PHIs, we have to be careful if there is more than
36166 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36167 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36168 // That also means that PHI construction must work forward from earlier to
36169 // later, and that the code must maintain a mapping from earlier PHI's
36170 // destination registers, and the registers that went into the PHI.
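// For example, given
//   t2 = CMOV t1, f1, cc
//   t3 = CMOV t2, f2, cc
// once "t2 = PHI [t1, FalseMBB], [f1, TrueMBB]" has been emitted, the table
// maps t2 -> (t1, f1), so when t2 shows up as an input of the next CMOV the
// PHI for t3 uses the matching incoming value of t2's own PHI instead of t2,
// which is not defined in those predecessors.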
36171 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36172 MachineInstrBuilder MIB;
36173
36174 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36175 Register DestReg = MIIt->getOperand(0).getReg();
36176 Register Op1Reg = MIIt->getOperand(1).getReg();
36177 Register Op2Reg = MIIt->getOperand(2).getReg();
36178
36179 // If this CMOV we are generating is the opposite condition from
36180 // the jump we generated, then we have to swap the operands for the
36181 // PHI that is going to be generated.
36182 if (MIIt->getOperand(3).getImm() == OppCC)
36183 std::swap(Op1Reg, Op2Reg);
36184
36185 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36186 Op1Reg = It->second.first;
36187
36188 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36189 Op2Reg = It->second.second;
36190
36191 MIB =
36192 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36193 .addReg(Op1Reg)
36194 .addMBB(FalseMBB)
36195 .addReg(Op2Reg)
36196 .addMBB(TrueMBB);
36197
36198 // Add this PHI to the rewrite table.
36199 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36200 }
36201
36202 return MIB;
36203}
36204
36205// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36206MachineBasicBlock *
36207X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36208 MachineInstr &SecondCascadedCMOV,
36209 MachineBasicBlock *ThisMBB) const {
36210 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36211 const MIMetadata MIMD(FirstCMOV);
36212
36213 // We lower cascaded CMOVs such as
36214 //
36215 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36216 //
36217 // to two successive branches.
36218 //
36219 // Without this, we would add a PHI between the two jumps, which ends up
36220 // creating a few copies all around. For instance, for
36221 //
36222 // (sitofp (zext (fcmp une)))
36223 //
36224 // we would generate:
36225 //
36226 // ucomiss %xmm1, %xmm0
36227 // movss <1.0f>, %xmm0
36228 // movaps %xmm0, %xmm1
36229 // jne .LBB5_2
36230 // xorps %xmm1, %xmm1
36231 // .LBB5_2:
36232 // jp .LBB5_4
36233 // movaps %xmm1, %xmm0
36234 // .LBB5_4:
36235 // retq
36236 //
36237 // because this custom-inserter would have generated:
36238 //
36239 // A
36240 // | \
36241 // | B
36242 // | /
36243 // C
36244 // | \
36245 // | D
36246 // | /
36247 // E
36248 //
36249 // A: X = ...; Y = ...
36250 // B: empty
36251 // C: Z = PHI [X, A], [Y, B]
36252 // D: empty
36253 // E: PHI [X, C], [Z, D]
36254 //
36255 // If we lower both CMOVs in a single step, we can instead generate:
36256 //
36257 // A
36258 // | \
36259 // | C
36260 // | /|
36261 // |/ |
36262 // | |
36263 // | D
36264 // | /
36265 // E
36266 //
36267 // A: X = ...; Y = ...
36268 // D: empty
36269 // E: PHI [X, A], [X, C], [Y, D]
36270 //
36271 // Which, in our sitofp/fcmp example, gives us something like:
36272 //
36273 // ucomiss %xmm1, %xmm0
36274 // movss <1.0f>, %xmm0
36275 // jne .LBB5_4
36276 // jp .LBB5_4
36277 // xorps %xmm0, %xmm0
36278 // .LBB5_4:
36279 // retq
36280 //
36281
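  // In the code below, A is ThisMBB, C is FirstInsertedMBB, D is
  // SecondInsertedMBB, and E is SinkMBB, so the PHI built in SinkMBB ends up
  // with one incoming value per predecessor.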
36282 // We lower cascaded CMOV into two successive branches to the same block.
36283 // EFLAGS is used by both, so mark it as live in the second.
36284 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36285 MachineFunction *F = ThisMBB->getParent();
36286 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36287 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36288 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36289
36290 MachineFunction::iterator It = ++ThisMBB->getIterator();
36291 F->insert(It, FirstInsertedMBB);
36292 F->insert(It, SecondInsertedMBB);
36293 F->insert(It, SinkMBB);
36294
36295 // For a cascaded CMOV, we lower it to two successive branches to
36296 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36297 // the FirstInsertedMBB.
36298 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36299
36300 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36301 // live into the sink and copy blocks.
36302 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36303 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36304 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36305 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36306 SinkMBB->addLiveIn(X86::EFLAGS);
36307 }
36308
36309 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36310 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36311 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36312 ThisMBB->end());
36313 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36314
36315 // Fallthrough block for ThisMBB.
36316 ThisMBB->addSuccessor(FirstInsertedMBB);
36317 // The true block target of the first branch is always SinkMBB.
36318 ThisMBB->addSuccessor(SinkMBB);
36319 // Fallthrough block for FirstInsertedMBB.
36320 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36321 // The true block for the branch of FirstInsertedMBB.
36322 FirstInsertedMBB->addSuccessor(SinkMBB);
36323 // This is fallthrough.
36324 SecondInsertedMBB->addSuccessor(SinkMBB);
36325
36326 // Create the conditional branch instructions.
36327 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36328 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36329
36330 X86::CondCode SecondCC =
36331 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36332 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36333 .addMBB(SinkMBB)
36334 .addImm(SecondCC);
36335
36336 // SinkMBB:
36337 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36338 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36339 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36340 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36341 MachineInstrBuilder MIB =
36342 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36343 .addReg(Op1Reg)
36344 .addMBB(SecondInsertedMBB)
36345 .addReg(Op2Reg)
36346 .addMBB(ThisMBB);
36347
36348 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36349 // (the True operand of the SELECT_CC/CMOV nodes).
36350 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36351
36352 // Now remove the CMOVs.
36353 FirstCMOV.eraseFromParent();
36354 SecondCascadedCMOV.eraseFromParent();
36355
36356 return SinkMBB;
36357}
36358
36359MachineBasicBlock *
36360X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36361 MachineBasicBlock *ThisMBB) const {
36362 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36363 const MIMetadata MIMD(MI);
36364
36365 // To "insert" a SELECT_CC instruction, we actually have to insert the
36366 // diamond control-flow pattern. The incoming instruction knows the
36367 // destination vreg to set, the condition code register to branch on, the
36368 // true/false values to select between and a branch opcode to use.
36369
36370 // ThisMBB:
36371 // ...
36372 // TrueVal = ...
36373 // cmpTY ccX, r1, r2
36374 // bCC copy1MBB
36375 // fallthrough --> FalseMBB
36376
36377 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36378 // as described above, by inserting a BB, and then making a PHI at the join
36379 // point to select the true and false operands of the CMOV in the PHI.
36380 //
36381 // The code also handles two different cases of multiple CMOV opcodes
36382 // in a row.
36383 //
36384 // Case 1:
36385 // In this case, there are multiple CMOVs in a row, all of which are based on
36386 // the same condition setting (or the exact opposite condition setting).
36387 // In this case we can lower all the CMOVs using a single inserted BB, and
36388 // then make a number of PHIs at the join point to model the CMOVs. The only
36389 // trickiness here, is that in a case like:
36390 //
36391 // t2 = CMOV cond1 t1, f1
36392 // t3 = CMOV cond1 t2, f2
36393 //
36394 // when rewriting this into PHIs, we have to perform some renaming on the
36395 // temps since you cannot have a PHI operand refer to a PHI result earlier
36396 // in the same block. The "simple" but wrong lowering would be:
36397 //
36398 // t2 = PHI t1(BB1), f1(BB2)
36399 // t3 = PHI t2(BB1), f2(BB2)
36400 //
36401 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36402 // renaming is to note that on the path through BB1, t2 is really just a
36403 // copy of t1, and do that renaming, properly generating:
36404 //
36405 // t2 = PHI t1(BB1), f1(BB2)
36406 // t3 = PHI t1(BB1), f2(BB2)
36407 //
36408 // Case 2:
36409 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36410 // function - EmitLoweredCascadedSelect.
36411
36412 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36413 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36414 MachineInstr *LastCMOV = &MI;
36415 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36416
36417 // Check for case 1, where there are multiple CMOVs with the same condition
36418 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36419 // number of jumps the most.
36420
36421 if (isCMOVPseudo(MI)) {
36422 // See if we have a string of CMOVS with the same condition. Skip over
36423 // intervening debug insts.
36424 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36425 (NextMIIt->getOperand(3).getImm() == CC ||
36426 NextMIIt->getOperand(3).getImm() == OppCC)) {
36427 LastCMOV = &*NextMIIt;
36428 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36429 }
36430 }
36431
36432 // This checks for case 2, but we only do so if we didn't already find
36433 // case 1, as indicated by LastCMOV == MI.
36434 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36435 NextMIIt->getOpcode() == MI.getOpcode() &&
36436 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36437 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36438 NextMIIt->getOperand(1).isKill()) {
36439 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36440 }
36441
36442 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36443 MachineFunction *F = ThisMBB->getParent();
36444 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36445 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36446
36447 MachineFunction::iterator It = ++ThisMBB->getIterator();
36448 F->insert(It, FalseMBB);
36449 F->insert(It, SinkMBB);
36450
36451 // Set the call frame size on entry to the new basic blocks.
36452 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36453 FalseMBB->setCallFrameSize(CallFrameSize);
36454 SinkMBB->setCallFrameSize(CallFrameSize);
36455
36456 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36457 // live into the sink and copy blocks.
36458 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36459 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36460 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36461 FalseMBB->addLiveIn(X86::EFLAGS);
36462 SinkMBB->addLiveIn(X86::EFLAGS);
36463 }
36464
36465 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36466 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36467 MachineBasicBlock::iterator(LastCMOV));
36468 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36469 if (MI.isDebugInstr())
36470 SinkMBB->push_back(MI.removeFromParent());
36471
36472 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36473 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36474 std::next(MachineBasicBlock::iterator(LastCMOV)),
36475 ThisMBB->end());
36476 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36477
36478 // Fallthrough block for ThisMBB.
36479 ThisMBB->addSuccessor(FalseMBB);
36480 // The true block target of the first (or only) branch is always a SinkMBB.
36481 ThisMBB->addSuccessor(SinkMBB);
36482 // Fallthrough block for FalseMBB.
36483 FalseMBB->addSuccessor(SinkMBB);
36484
36485 // Create the conditional branch instruction.
36486 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36487
36488 // SinkMBB:
36489 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36490 // ...
36491 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36492 MachineBasicBlock::iterator MIItEnd =
36493 std::next(MachineBasicBlock::iterator(LastCMOV));
36494 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36495
36496 // Now remove the CMOV(s).
36497 ThisMBB->erase(MIItBegin, MIItEnd);
36498
36499 return SinkMBB;
36500}
36501
36502static unsigned getSUBriOpcode(bool IsLP64) {
36503 if (IsLP64)
36504 return X86::SUB64ri32;
36505 else
36506 return X86::SUB32ri;
36507}
36508
36509MachineBasicBlock *
36510X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36511 MachineBasicBlock *MBB) const {
36512 MachineFunction *MF = MBB->getParent();
36513 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36514 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36515 const MIMetadata MIMD(MI);
36516 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36517
36518 const unsigned ProbeSize = getStackProbeSize(*MF);
36519
36520 MachineRegisterInfo &MRI = MF->getRegInfo();
36521 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36522 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36523 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36524
36525 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36526 MF->insert(MBBIter, testMBB);
36527 MF->insert(MBBIter, blockMBB);
36528 MF->insert(MBBIter, tailMBB);
36529
36530 Register sizeVReg = MI.getOperand(1).getReg();
36531
36532 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36533
36534 Register TmpStackPtr = MRI.createVirtualRegister(
36535 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36536 Register FinalStackPtr = MRI.createVirtualRegister(
36537 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36538
36539 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36540 .addReg(physSPReg);
36541 {
36542 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36543 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36544 .addReg(TmpStackPtr)
36545 .addReg(sizeVReg);
36546 }
36547
36548 // test rsp size
36549
36550 BuildMI(testMBB, MIMD,
36551 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36552 .addReg(FinalStackPtr)
36553 .addReg(physSPReg);
36554
36555 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36556 .addMBB(tailMBB)
36557 .addImm(X86::COND_GE);
36558 testMBB->addSuccessor(blockMBB);
36559 testMBB->addSuccessor(tailMBB);
36560
36561 // Touch the block then extend it. This is done on the opposite side of
36562 // static probe where we allocate then touch, to avoid the need of probing the
36563 // tail of the static alloca. Possible scenarios are:
36564 //
36565 // + ---- <- ------------ <- ------------- <- ------------ +
36566 // | |
36567 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36568 // | |
36569 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36570 //
36571 // The property we want to enforce is to never have more than [page alloc] between two probes.
36572
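  // In the blocks below, blockMBB touches the current stack page (the XOR at
  // offset 0 from the stack pointer) and then moves the stack pointer down by
  // one ProbeSize page; testMBB re-checks it against FinalStackPtr, so no more
  // than one page is ever left untouched before tailMBB runs.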
36573 const unsigned XORMIOpc =
36574 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36575 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36576 .addImm(0);
36577
36578 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36579 physSPReg)
36580 .addReg(physSPReg)
36581 .addImm(ProbeSize);
36582
36583 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36584 blockMBB->addSuccessor(testMBB);
36585
36586 // Replace original instruction by the expected stack ptr
36587 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36588 MI.getOperand(0).getReg())
36589 .addReg(FinalStackPtr);
36590
36591 tailMBB->splice(tailMBB->end(), MBB,
36592 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36593 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36594 MBB->addSuccessor(testMBB);
36595
36596 // Delete the original pseudo instruction.
36597 MI.eraseFromParent();
36598
36599 // And we're done.
36600 return tailMBB;
36601}
36602
36603MachineBasicBlock *
36604X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36605 MachineBasicBlock *BB) const {
36606 MachineFunction *MF = BB->getParent();
36607 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36608 const MIMetadata MIMD(MI);
36609 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36610
36611 assert(MF->shouldSplitStack());
36612
36613 const bool Is64Bit = Subtarget.is64Bit();
36614 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36615
36616 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36617 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
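  // The split-stack ABI keeps the current stack limit in thread-local storage;
  // the offsets above select %fs:0x70 (LP64), %fs:0x40 (x32) or %gs:0x30
  // (i386), and the CMP below compares the prospective stack pointer against
  // that slot.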
36618
36619 // BB:
36620 // ... [Till the alloca]
36621 // If stacklet is not large enough, jump to mallocMBB
36622 //
36623 // bumpMBB:
36624 // Allocate by subtracting from RSP
36625 // Jump to continueMBB
36626 //
36627 // mallocMBB:
36628 // Allocate by call to runtime
36629 //
36630 // continueMBB:
36631 // ...
36632 // [rest of original BB]
36633 //
36634
36635 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36636 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36637 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36638
36639 MachineRegisterInfo &MRI = MF->getRegInfo();
36640 const TargetRegisterClass *AddrRegClass =
36641 getRegClassFor(getPointerTy(MF->getDataLayout()));
36642
36643 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36644 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36645 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36646 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36647 sizeVReg = MI.getOperand(1).getReg(),
36648 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36649
36650 MachineFunction::iterator MBBIter = ++BB->getIterator();
36651
36652 MF->insert(MBBIter, bumpMBB);
36653 MF->insert(MBBIter, mallocMBB);
36654 MF->insert(MBBIter, continueMBB);
36655
36656 continueMBB->splice(continueMBB->begin(), BB,
36657 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36658 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36659
36660 // Add code to the main basic block to check if the stack limit has been hit,
36661 // and if so, jump to mallocMBB otherwise to bumpMBB.
36662 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36663 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36664 .addReg(tmpSPVReg).addReg(sizeVReg);
36665 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36666 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36667 .addReg(SPLimitVReg);
36668 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36669
36670 // bumpMBB simply decreases the stack pointer, since we know the current
36671 // stacklet has enough space.
36672 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36673 .addReg(SPLimitVReg);
36674 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36675 .addReg(SPLimitVReg);
36676 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36677
36678 // Calls into a routine in libgcc to allocate more space from the heap.
36679 const uint32_t *RegMask =
36680 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36681 if (IsLP64) {
36682 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36683 .addReg(sizeVReg);
36684 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36685 .addExternalSymbol("__morestack_allocate_stack_space")
36686 .addRegMask(RegMask)
36687 .addReg(X86::RDI, RegState::Implicit)
36688 .addReg(X86::RAX, RegState::ImplicitDefine);
36689 } else if (Is64Bit) {
36690 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36691 .addReg(sizeVReg);
36692 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36693 .addExternalSymbol("__morestack_allocate_stack_space")
36694 .addRegMask(RegMask)
36695 .addReg(X86::EDI, RegState::Implicit)
36696 .addReg(X86::EAX, RegState::ImplicitDefine);
36697 } else {
36698 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36699 .addImm(12);
36700 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36701 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36702 .addExternalSymbol("__morestack_allocate_stack_space")
36703 .addRegMask(RegMask)
36704 .addReg(X86::EAX, RegState::ImplicitDefine);
36705 }
36706
36707 if (!Is64Bit)
36708 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36709 .addImm(16);
36710
36711 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36712 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36713 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36714
36715 // Set up the CFG correctly.
36716 BB->addSuccessor(bumpMBB);
36717 BB->addSuccessor(mallocMBB);
36718 mallocMBB->addSuccessor(continueMBB);
36719 bumpMBB->addSuccessor(continueMBB);
36720
36721 // Take care of the PHI nodes.
36722 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36723 MI.getOperand(0).getReg())
36724 .addReg(mallocPtrVReg)
36725 .addMBB(mallocMBB)
36726 .addReg(bumpSPPtrVReg)
36727 .addMBB(bumpMBB);
36728
36729 // Delete the original pseudo instruction.
36730 MI.eraseFromParent();
36731
36732 // And we're done.
36733 return continueMBB;
36734}
36735
36736MachineBasicBlock *
36737X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36738 MachineBasicBlock *BB) const {
36739 MachineFunction *MF = BB->getParent();
36740 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36741 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36742 const MIMetadata MIMD(MI);
36743
36746 "SEH does not use catchret!");
36747
36748 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36749 if (!Subtarget.is32Bit())
36750 return BB;
36751
36752 // C++ EH creates a new target block to hold the restore code, and wires up
36753 // the new block to the return destination with a normal JMP_4.
36754 MachineBasicBlock *RestoreMBB =
36755 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36756 assert(BB->succ_size() == 1);
36757 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36758 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36759 BB->addSuccessor(RestoreMBB);
36760 MI.getOperand(0).setMBB(RestoreMBB);
36761
36762 // Marking this as an EH pad but not a funclet entry block causes PEI to
36763 // restore stack pointers in the block.
36764 RestoreMBB->setIsEHPad(true);
36765
36766 auto RestoreMBBI = RestoreMBB->begin();
36767 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36768 return BB;
36769}
36770
36771MachineBasicBlock *
36772X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36773 MachineBasicBlock *BB) const {
36774 // This is pretty easy. We're taking the value that we received from
36775 // our load from the relocation, sticking it in either RDI (x86-64)
36776 // or EAX and doing an indirect call. The return value will then
36777 // be in the normal return register.
36778 MachineFunction *F = BB->getParent();
36779 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36780 const MIMetadata MIMD(MI);
36781
36782 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36783 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36784
36785 // Get a register mask for the lowered call.
36786 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36787 // proper register mask.
36788 const uint32_t *RegMask =
36789 Subtarget.is64Bit() ?
36790 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36791 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36792 if (Subtarget.is64Bit()) {
36793 MachineInstrBuilder MIB =
36794 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36795 .addReg(X86::RIP)
36796 .addImm(0)
36797 .addReg(0)
36798 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36799 MI.getOperand(3).getTargetFlags())
36800 .addReg(0);
36801 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36802 addDirectMem(MIB, X86::RDI);
36803 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36804 } else if (!isPositionIndependent()) {
36805 MachineInstrBuilder MIB =
36806 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36807 .addReg(0)
36808 .addImm(0)
36809 .addReg(0)
36810 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36811 MI.getOperand(3).getTargetFlags())
36812 .addReg(0);
36813 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36814 addDirectMem(MIB, X86::EAX);
36815 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36816 } else {
36817 MachineInstrBuilder MIB =
36818 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36819 .addReg(TII->getGlobalBaseReg(F))
36820 .addImm(0)
36821 .addReg(0)
36822 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36823 MI.getOperand(3).getTargetFlags())
36824 .addReg(0);
36825 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36826 addDirectMem(MIB, X86::EAX);
36827 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36828 }
36829
36830 MI.eraseFromParent(); // The pseudo instruction is gone now.
36831 return BB;
36832}
36833
36834static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36835 switch (RPOpc) {
36836 case X86::INDIRECT_THUNK_CALL32:
36837 return X86::CALLpcrel32;
36838 case X86::INDIRECT_THUNK_CALL64:
36839 return X86::CALL64pcrel32;
36840 case X86::INDIRECT_THUNK_TCRETURN32:
36841 return X86::TCRETURNdi;
36842 case X86::INDIRECT_THUNK_TCRETURN64:
36843 return X86::TCRETURNdi64;
36844 }
36845 llvm_unreachable("not indirect thunk opcode");
36846}
36847
36848static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36849 Register Reg) {
36850 if (Subtarget.useRetpolineExternalThunk()) {
36851 // When using an external thunk for retpolines, we pick names that match the
36852 // names GCC happens to use as well. This helps simplify the implementation
36853 // of the thunks for kernels where they have no easy ability to create
36854 // aliases and are doing non-trivial configuration of the thunk's body. For
36855 // example, the Linux kernel will do boot-time hot patching of the thunk
36856 // bodies and cannot easily export aliases of these to loaded modules.
36857 //
36858 // Note that at any point in the future, we may need to change the semantics
36859 // of how we implement retpolines and at that time will likely change the
36860 // name of the called thunk. Essentially, there is no hard guarantee that
36861 // LLVM will generate calls to specific thunks, we merely make a best-effort
36862 // attempt to help out kernels and other systems where duplicating the
36863 // thunks is costly.
36864 switch (Reg.id()) {
36865 case X86::EAX:
36866 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36867 return "__x86_indirect_thunk_eax";
36868 case X86::ECX:
36869 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36870 return "__x86_indirect_thunk_ecx";
36871 case X86::EDX:
36872 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36873 return "__x86_indirect_thunk_edx";
36874 case X86::EDI:
36875 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36876 return "__x86_indirect_thunk_edi";
36877 case X86::R11:
36878 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36879 return "__x86_indirect_thunk_r11";
36880 }
36881 llvm_unreachable("unexpected reg for external indirect thunk");
36882 }
36883
36884 if (Subtarget.useRetpolineIndirectCalls() ||
36885 Subtarget.useRetpolineIndirectBranches()) {
36886 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36887 switch (Reg.id()) {
36888 case X86::EAX:
36889 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36890 return "__llvm_retpoline_eax";
36891 case X86::ECX:
36892 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36893 return "__llvm_retpoline_ecx";
36894 case X86::EDX:
36895 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36896 return "__llvm_retpoline_edx";
36897 case X86::EDI:
36898 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36899 return "__llvm_retpoline_edi";
36900 case X86::R11:
36901 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36902 return "__llvm_retpoline_r11";
36903 }
36904 llvm_unreachable("unexpected reg for retpoline");
36905 }
36906
36907 if (Subtarget.useLVIControlFlowIntegrity()) {
36908 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36909 return "__llvm_lvi_thunk_r11";
36910 }
36911 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36912}
36913
36914MachineBasicBlock *
36915X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36916 MachineBasicBlock *BB) const {
36917 // Copy the virtual register into the R11 physical register and
36918 // call the retpoline thunk.
36919 const MIMetadata MIMD(MI);
36920 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36921 Register CalleeVReg = MI.getOperand(0).getReg();
36922 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36923
36924 // Find an available scratch register to hold the callee. On 64-bit, we can
36925 // just use R11, but we scan for uses anyway to ensure we don't generate
36926 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36927 // already a register use operand to the call to hold the callee. If none
36928 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36929 // register and ESI is the base pointer to realigned stack frames with VLAs.
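  // The net effect (64-bit case, illustrative) is roughly:
  //   movq  %<callee-vreg>, %r11
  //   callq __llvm_retpoline_r11      (or the matching external/LVI thunk)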
36930 SmallVector<Register, 3> AvailableRegs;
36931 if (Subtarget.is64Bit())
36932 AvailableRegs.push_back(X86::R11);
36933 else
36934 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36935
36936 // Zero out any registers that are already used.
36937 for (const auto &MO : MI.operands()) {
36938 if (MO.isReg() && MO.isUse())
36939 llvm::replace(AvailableRegs, MO.getReg(), Register());
36940 }
36941
36942 // Choose the first remaining non-zero available register.
36943 Register AvailableReg;
36944 for (Register MaybeReg : AvailableRegs) {
36945 if (MaybeReg) {
36946 AvailableReg = MaybeReg;
36947 break;
36948 }
36949 }
36950 if (!AvailableReg)
36951 report_fatal_error("calling convention incompatible with retpoline, no "
36952 "available registers");
36953
36954 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36955
36956 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36957 .addReg(CalleeVReg);
36958 MI.getOperand(0).ChangeToES(Symbol);
36959 MI.setDesc(TII->get(Opc));
36960 MachineInstrBuilder(*BB->getParent(), &MI)
36961 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36962 return BB;
36963}
36964
36965/// SetJmp implies future control flow change upon calling the corresponding
36966/// LongJmp.
36967/// Instead of using the 'return' instruction, the long jump fixes the stack and
36968/// performs an indirect branch. To do so it uses the registers that were stored
36969/// in the jump buffer (when calling SetJmp).
36970/// In case the shadow stack is enabled we need to fix it as well, because some
36971/// return addresses will be skipped.
36972/// The function will save the SSP for future fixing in the function
36973/// emitLongJmpShadowStackFix.
36974/// \sa emitLongJmpShadowStackFix
36975/// \param [in] MI The temporary Machine Instruction for the builtin.
36976/// \param [in] MBB The Machine Basic Block that will be modified.
36977void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36978 MachineBasicBlock *MBB) const {
36979 const MIMetadata MIMD(MI);
36980 MachineFunction *MF = MBB->getParent();
36981 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36982 MachineRegisterInfo &MRI = MF->getRegInfo();
36983 MachineInstrBuilder MIB;
36984
36985 // Memory Reference.
36986 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36987
36988 // Initialize a register with zero.
36989 MVT PVT = getPointerTy(MF->getDataLayout());
36990 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36991 Register ZReg = MRI.createVirtualRegister(PtrRC);
36992 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36993 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36994 .addDef(ZReg)
36995 .addReg(ZReg, RegState::Undef)
36996 .addReg(ZReg, RegState::Undef);
36997
36998 // Read the current SSP Register value to the zeroed register.
36999 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37000 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37001 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37002
37003 // Write the SSP register value to offset 3 in input memory buffer.
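  // The SjLj helpers in this file treat the buffer as pointer-sized slots:
  // [0] frame pointer, [1] resume address, [2] stack pointer, [3] shadow
  // stack pointer - hence the 3 * PVT.getStoreSize() displacement below.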
37004 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37005 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
37006 const int64_t SSPOffset = 3 * PVT.getStoreSize();
37007 const unsigned MemOpndSlot = 1;
37008 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37009 if (i == X86::AddrDisp)
37010 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37011 else
37012 MIB.add(MI.getOperand(MemOpndSlot + i));
37013 }
37014 MIB.addReg(SSPCopyReg);
37015 MIB.setMemRefs(MMOs);
37016}
37017
37018MachineBasicBlock *
37019X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37020 MachineBasicBlock *MBB) const {
37021 const MIMetadata MIMD(MI);
37022 MachineFunction *MF = MBB->getParent();
37023 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37024 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37025 MachineRegisterInfo &MRI = MF->getRegInfo();
37026
37027 const BasicBlock *BB = MBB->getBasicBlock();
37028 MachineFunction::iterator I = ++MBB->getIterator();
37029
37030 // Memory Reference
37031 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37032
37033 unsigned MemOpndSlot = 0;
37034
37035 unsigned CurOp = 0;
37036
37037 Register DstReg = MI.getOperand(CurOp++).getReg();
37038 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37039 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37040 (void)TRI;
37041 Register mainDstReg = MRI.createVirtualRegister(RC);
37042 Register restoreDstReg = MRI.createVirtualRegister(RC);
37043
37044 MemOpndSlot = CurOp;
37045
37046 MVT PVT = getPointerTy(MF->getDataLayout());
37047 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37048 "Invalid Pointer Size!");
37049
37050 // For v = setjmp(buf), we generate
37051 //
37052 // thisMBB:
37053 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37054 // SjLjSetup restoreMBB
37055 //
37056 // mainMBB:
37057 // v_main = 0
37058 //
37059 // sinkMBB:
37060 // v = phi(main, restore)
37061 //
37062 // restoreMBB:
37063 // if base pointer being used, load it from frame
37064 // v_restore = 1
37065
37066 MachineBasicBlock *thisMBB = MBB;
37067 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37068 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37069 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37070 MF->insert(I, mainMBB);
37071 MF->insert(I, sinkMBB);
37072 MF->push_back(restoreMBB);
37073 restoreMBB->setMachineBlockAddressTaken();
37074
37075 MachineInstrBuilder MIB;
37076
37077 // Transfer the remainder of BB and its successor edges to sinkMBB.
37078 sinkMBB->splice(sinkMBB->begin(), MBB,
37079 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37080 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37081
37082 // thisMBB:
37083 unsigned PtrStoreOpc = 0;
37084 Register LabelReg;
37085 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37086 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37087 !isPositionIndependent();
37088
37089 // Prepare IP either in reg or imm.
37090 if (!UseImmLabel) {
37091 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37092 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37093 LabelReg = MRI.createVirtualRegister(PtrRC);
37094 if (Subtarget.is64Bit()) {
37095 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37096 .addReg(X86::RIP)
37097 .addImm(0)
37098 .addReg(0)
37099 .addMBB(restoreMBB)
37100 .addReg(0);
37101 } else {
37102 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37103 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37104 .addReg(XII->getGlobalBaseReg(MF))
37105 .addImm(0)
37106 .addReg(0)
37107 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37108 .addReg(0);
37109 }
37110 } else
37111 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37112 // Store IP
37113 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37114 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37115 if (i == X86::AddrDisp)
37116 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37117 else
37118 MIB.add(MI.getOperand(MemOpndSlot + i));
37119 }
37120 if (!UseImmLabel)
37121 MIB.addReg(LabelReg);
37122 else
37123 MIB.addMBB(restoreMBB);
37124 MIB.setMemRefs(MMOs);
37125
37126 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37127 emitSetJmpShadowStackFix(MI, thisMBB);
37128 }
37129
37130 // Setup
37131 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37132 .addMBB(restoreMBB);
37133
37134 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37135 MIB.addRegMask(RegInfo->getNoPreservedMask());
37136 thisMBB->addSuccessor(mainMBB);
37137 thisMBB->addSuccessor(restoreMBB);
37138
37139 // mainMBB:
37140 // EAX = 0
37141 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37142 mainMBB->addSuccessor(sinkMBB);
37143
37144 // sinkMBB:
37145 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37146 .addReg(mainDstReg)
37147 .addMBB(mainMBB)
37148 .addReg(restoreDstReg)
37149 .addMBB(restoreMBB);
37150
37151 // restoreMBB:
37152 if (RegInfo->hasBasePointer(*MF)) {
37153 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37154 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37155 X86FI->setRestoreBasePointer(MF);
37156 Register FramePtr = RegInfo->getFrameRegister(*MF);
37157 Register BasePtr = RegInfo->getBaseRegister();
37158 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37159 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37160 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37161 .setMIFlag(MachineInstr::FrameSetup);
37162 }
37163 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37164 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37165 restoreMBB->addSuccessor(sinkMBB);
37166
37167 MI.eraseFromParent();
37168 return sinkMBB;
37169}
37170
37171/// Fix the shadow stack using the previously saved SSP pointer.
37172/// \sa emitSetJmpShadowStackFix
37173/// \param [in] MI The temporary Machine Instruction for the builtin.
37174/// \param [in] MBB The Machine Basic Block that will be modified.
37175/// \return The sink MBB that will perform the future indirect branch.
37176MachineBasicBlock *
37177X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37178 MachineBasicBlock *MBB) const {
37179 const MIMetadata MIMD(MI);
37180 MachineFunction *MF = MBB->getParent();
37181 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37182 MachineRegisterInfo &MRI = MF->getRegInfo();
37183
37184 // Memory Reference
37185 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37186
37187 MVT PVT = getPointerTy(MF->getDataLayout());
37188 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37189
37190 // checkSspMBB:
37191 // xor vreg1, vreg1
37192 // rdssp vreg1
37193 // test vreg1, vreg1
37194 // je sinkMBB # Jump if Shadow Stack is not supported
37195 // fallMBB:
37196 // mov buf+24/12(%rip), vreg2
37197 // sub vreg1, vreg2
37198 // jbe sinkMBB # No need to fix the Shadow Stack
37199 // fixShadowMBB:
37200 // shr 3/2, vreg2
37201 // incssp vreg2 # fix the SSP according to the lower 8 bits
37202 // shr 8, vreg2
37203 // je sinkMBB
37204 // fixShadowLoopPrepareMBB:
37205 // shl vreg2
37206 // mov 128, vreg3
37207 // fixShadowLoopMBB:
37208 // incssp vreg3
37209 // dec vreg2
37210 // jne fixShadowLoopMBB # Iterate until you finish fixing
37211 // # the Shadow Stack
37212 // sinkMBB:
37213
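  // For example, on x86-64 a delta of 300 shadow-stack entries (2400 bytes)
  // becomes 300 after the first shr; incssp fixes 300 & 0xff = 44 entries,
  // the second shr leaves 1, the shl doubles it to 2, and two iterations of
  // "incssp 128" account for the remaining 256 entries.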
37214 MachineFunction::iterator I = ++MBB->getIterator();
37215 const BasicBlock *BB = MBB->getBasicBlock();
37216
37217 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37218 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37219 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37220 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37221 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37222 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37223 MF->insert(I, checkSspMBB);
37224 MF->insert(I, fallMBB);
37225 MF->insert(I, fixShadowMBB);
37226 MF->insert(I, fixShadowLoopPrepareMBB);
37227 MF->insert(I, fixShadowLoopMBB);
37228 MF->insert(I, sinkMBB);
37229
37230 // Transfer the remainder of BB and its successor edges to sinkMBB.
37231 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37232 MBB->end());
37233 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37234
37235 MBB->addSuccessor(checkSspMBB);
37236
37237 // Initialize a register with zero.
37238 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37239 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37240
37241 if (PVT == MVT::i64) {
37242 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37243 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37244 .addImm(0)
37245 .addReg(ZReg)
37246 .addImm(X86::sub_32bit);
37247 ZReg = TmpZReg;
37248 }
37249
37250 // Read the current SSP Register value to the zeroed register.
37251 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37252 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37253 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37254
37255 // Check whether the result of the SSP register is zero and jump directly
37256 // to the sink.
37257 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37258 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37259 .addReg(SSPCopyReg)
37260 .addReg(SSPCopyReg);
37261 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37262 .addMBB(sinkMBB)
37263 .addImm(X86::COND_E);
37264 checkSspMBB->addSuccessor(sinkMBB);
37265 checkSspMBB->addSuccessor(fallMBB);
37266
37267 // Reload the previously saved SSP register value.
37268 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37269 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37270 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37271 MachineInstrBuilder MIB =
37272 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37273 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37274 const MachineOperand &MO = MI.getOperand(i);
37275 if (i == X86::AddrDisp)
37276 MIB.addDisp(MO, SPPOffset);
37277 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37278 // preserve kill flags.
37279 MIB.addReg(MO.getReg());
37280 else
37281 MIB.add(MO);
37282 }
37283 MIB.setMemRefs(MMOs);
37284
37285 // Subtract the current SSP from the previous SSP.
37286 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37287 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37288 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37289 .addReg(PrevSSPReg)
37290 .addReg(SSPCopyReg);
37291
37292 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37293 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37294 .addMBB(sinkMBB)
37295 .addImm(X86::COND_BE);
37296 fallMBB->addSuccessor(sinkMBB);
37297 fallMBB->addSuccessor(fixShadowMBB);
37298
37299 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37300 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37301 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37302 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37303 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37304 .addReg(SspSubReg)
37305 .addImm(Offset);
37306
37307 // Increment the SSP using only the lower 8 bits of the delta (incssp
37307 // ignores the upper bits).
37308 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37309 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37310
37311 // Reset the lower 8 bits.
37312 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37313 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37314 .addReg(SspFirstShrReg)
37315 .addImm(8);
37316
37317 // Jump if the result of the shift is zero.
37318 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37319 .addMBB(sinkMBB)
37320 .addImm(X86::COND_E);
37321 fixShadowMBB->addSuccessor(sinkMBB);
37322 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37323
37324 // Do a single shift left.
37325 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37326 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37327 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37328 .addReg(SspSecondShrReg)
37329 .addImm(1);
37330
37331 // Save the value 128 to a register (will be used next with incssp).
37332 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37333 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37334 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37335 .addImm(128);
37336 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37337
37338 // Since incssp only looks at the lower 8 bits, we might need to do several
37339 // iterations of incssp until we finish fixing the shadow stack.
37340 Register DecReg = MRI.createVirtualRegister(PtrRC);
37341 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37342 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37343 .addReg(SspAfterShlReg)
37344 .addMBB(fixShadowLoopPrepareMBB)
37345 .addReg(DecReg)
37346 .addMBB(fixShadowLoopMBB);
37347
37348 // Every iteration we increase the SSP by 128.
37349 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37350
37351 // Every iteration we decrement the counter by 1.
37352 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37353 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37354
37355 // Jump if the counter is not zero yet.
37356 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37357 .addMBB(fixShadowLoopMBB)
37358 .addImm(X86::COND_NE);
37359 fixShadowLoopMBB->addSuccessor(sinkMBB);
37360 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37361
37362 return sinkMBB;
37363}
37364
37365MachineBasicBlock *
37366X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37367 MachineBasicBlock *MBB) const {
37368 const MIMetadata MIMD(MI);
37369 MachineFunction *MF = MBB->getParent();
37370 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37371 MachineRegisterInfo &MRI = MF->getRegInfo();
37372
37373 // Memory Reference
37374 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37375
37376 MVT PVT = getPointerTy(MF->getDataLayout());
37377 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37378 "Invalid Pointer Size!");
37379
37380 const TargetRegisterClass *RC =
37381 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37382 Register Tmp = MRI.createVirtualRegister(RC);
37383 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37384 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37385 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37386 Register SP = RegInfo->getStackRegister();
37387
37388 MachineInstrBuilder MIB;
37389
37390 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37391 const int64_t SPOffset = 2 * PVT.getStoreSize();
37392
37393 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37394 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37395
37396 MachineBasicBlock *thisMBB = MBB;
37397
37398 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37399 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37400 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37401 }
37402
37403 // Reload FP
37404 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37405 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37406 const MachineOperand &MO = MI.getOperand(i);
37407 if (MO.isReg()) // Don't add the whole operand, we don't want to
37408 // preserve kill flags.
37409 MIB.addReg(MO.getReg());
37410 else
37411 MIB.add(MO);
37412 }
37413 MIB.setMemRefs(MMOs);
37415
37416 // Reload IP
37417 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37418 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37419 const MachineOperand &MO = MI.getOperand(i);
37420 if (i == X86::AddrDisp)
37421 MIB.addDisp(MO, LabelOffset);
37422 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37423 // preserve kill flags.
37424 MIB.addReg(MO.getReg());
37425 else
37426 MIB.add(MO);
37427 }
37428 MIB.setMemRefs(MMOs);
37429
37430 // Reload SP
37431 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37432 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37433 if (i == X86::AddrDisp)
37434 MIB.addDisp(MI.getOperand(i), SPOffset);
37435 else
37436 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37437 // the last instruction of the expansion.
37438 }
37439 MIB.setMemRefs(MMOs);
37441
37442 // Jump
37443 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37444
37445 MI.eraseFromParent();
37446 return thisMBB;
37447}
37448
37449void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37450                                                MachineBasicBlock *MBB,
37451                                                MachineBasicBlock *DispatchBB,
37452 int FI) const {
37453 const MIMetadata MIMD(MI);
37454 MachineFunction *MF = MBB->getParent();
37455 MachineRegisterInfo *MRI = &MF->getRegInfo();
37456 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37457
37458 MVT PVT = getPointerTy(MF->getDataLayout());
37459 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37460
37461 unsigned Op = 0;
37462 Register VR;
37463
37464 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37465                    !isPositionIndependent();
37466
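  // In the small code model (and without PIC) the dispatch block address can
  // be stored directly as an immediate; otherwise it is first materialized
  // into a register with LEA.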
37467 if (UseImmLabel) {
37468 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37469 } else {
37470 const TargetRegisterClass *TRC =
37471 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37472 VR = MRI->createVirtualRegister(TRC);
37473 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37474
37475 if (Subtarget.is64Bit())
37476 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37477 .addReg(X86::RIP)
37478 .addImm(1)
37479 .addReg(0)
37480 .addMBB(DispatchBB)
37481 .addReg(0);
37482 else
37483 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37484 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37485 .addImm(1)
37486 .addReg(0)
37487 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37488 .addReg(0);
37489 }
37490
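  // Store the dispatch block address into the return slot of the SjLj
  // function context (offset 56 in 64-bit mode, 36 in 32-bit mode).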
37491 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37492 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37493 if (UseImmLabel)
37494 MIB.addMBB(DispatchBB);
37495 else
37496 MIB.addReg(VR);
37497}
37498
37499MachineBasicBlock *
37500X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37501 MachineBasicBlock *BB) const {
37502 const MIMetadata MIMD(MI);
37503 MachineFunction *MF = BB->getParent();
37504 MachineRegisterInfo *MRI = &MF->getRegInfo();
37505 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37506 int FI = MF->getFrameInfo().getFunctionContextIndex();
37507
37508 // Get a mapping of the call site numbers to all of the landing pads they're
37509 // associated with.
37510 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37511 unsigned MaxCSNum = 0;
37512 for (auto &MBB : *MF) {
37513 if (!MBB.isEHPad())
37514 continue;
37515
37516 MCSymbol *Sym = nullptr;
37517 for (const auto &MI : MBB) {
37518 if (MI.isDebugInstr())
37519 continue;
37520
37521 assert(MI.isEHLabel() && "expected EH_LABEL");
37522 Sym = MI.getOperand(0).getMCSymbol();
37523 break;
37524 }
37525
37526 if (!MF->hasCallSiteLandingPad(Sym))
37527 continue;
37528
37529 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37530 CallSiteNumToLPad[CSI].push_back(&MBB);
37531 MaxCSNum = std::max(MaxCSNum, CSI);
37532 }
37533 }
37534
37535 // Get an ordered list of the machine basic blocks for the jump table.
37536 std::vector<MachineBasicBlock *> LPadList;
37537 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37538 LPadList.reserve(CallSiteNumToLPad.size());
37539
37540 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37541 for (auto &LP : CallSiteNumToLPad[CSI]) {
37542 LPadList.push_back(LP);
37543 InvokeBBs.insert_range(LP->predecessors());
37544 }
37545 }
37546
37547 assert(!LPadList.empty() &&
37548 "No landing pad destinations for the dispatch jump table!");
37549
37550 // Create the MBBs for the dispatch code.
37551
37552 // Shove the dispatch's address into the return slot in the function context.
37553 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37554 DispatchBB->setIsEHPad(true);
37555
37556 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37557 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37558 DispatchBB->addSuccessor(TrapBB);
37559
37560 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37561 DispatchBB->addSuccessor(DispContBB);
37562
37563 // Insert MBBs.
37564 MF->push_back(DispatchBB);
37565 MF->push_back(DispContBB);
37566 MF->push_back(TrapBB);
37567
37568 // Insert code into the entry block that creates and registers the function
37569 // context.
37570 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37571
37572 // Create the jump table and associated information
37573 unsigned JTE = getJumpTableEncoding();
37574 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37575 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37576
37577 const X86RegisterInfo &RI = TII->getRegisterInfo();
37578 // Add a register mask with no preserved registers. This results in all
37579 // registers being marked as clobbered.
37580 if (RI.hasBasePointer(*MF)) {
37581 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37582 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37583 MFI->setRestoreBasePointer(MF);
37584
37585 Register FP = RI.getFrameRegister(*MF);
37586 Register BP = RI.getBaseRegister();
37587 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37588     addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37589                  MFI->getRestoreBasePointerOffset())
37590         .addRegMask(RI.getNoPreservedMask());
37591 } else {
37592     BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37593         .addRegMask(RI.getNoPreservedMask());
37594 }
37595
37596 // IReg is used as an index in a memory operand and therefore can't be SP
37597 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
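  // Load the call site index from the function context and range-check it
  // against the landing-pad list; out-of-range values branch to the trap
  // block.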
37598 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37599 Subtarget.is64Bit() ? 8 : 4);
37600 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37601 .addReg(IReg)
37602 .addImm(LPadList.size());
37603 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37604       .addMBB(TrapBB)
37605       .addImm(X86::COND_AE);
37606
37607 if (Subtarget.is64Bit()) {
37608 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37609 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37610
37611 // leaq .LJTI0_0(%rip), BReg
37612 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37613 .addReg(X86::RIP)
37614 .addImm(1)
37615 .addReg(0)
37616 .addJumpTableIndex(MJTI)
37617 .addReg(0);
37618 // movzx IReg64, IReg
37619 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37620 .addImm(0)
37621 .addReg(IReg)
37622 .addImm(X86::sub_32bit);
37623
37624     switch (JTE) {
37625     case MachineJumpTableInfo::EK_BlockAddress:
37626 // jmpq *(BReg,IReg64,8)
37627 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37628 .addReg(BReg)
37629 .addImm(8)
37630 .addReg(IReg64)
37631 .addImm(0)
37632 .addReg(0);
37633       break;
37634     case MachineJumpTableInfo::EK_Custom32: {
37635 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37636 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37637 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37638
37639 // movl (BReg,IReg64,4), OReg
37640 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37641 .addReg(BReg)
37642 .addImm(4)
37643 .addReg(IReg64)
37644 .addImm(0)
37645 .addReg(0);
37646 // movsx OReg64, OReg
37647 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37648 .addReg(OReg);
37649 // addq BReg, OReg64, TReg
37650 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37651 .addReg(OReg64)
37652 .addReg(BReg);
37653 // jmpq *TReg
37654 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37655 break;
37656 }
37657 default:
37658 llvm_unreachable("Unexpected jump table encoding");
37659 }
37660 } else {
37661 // jmpl *.LJTI0_0(,IReg,4)
37662 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37663 .addReg(0)
37664 .addImm(4)
37665 .addReg(IReg)
37666 .addJumpTableIndex(MJTI)
37667 .addReg(0);
37668 }
37669
37670 // Add the jump table entries as successors to the MBB.
37671 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37672 for (auto &LP : LPadList)
37673 if (SeenMBBs.insert(LP).second)
37674 DispContBB->addSuccessor(LP);
37675
37676   // N.B. the order the invoke BBs are processed in doesn't matter here.
37677   SmallVector<MachineBasicBlock *, 64> MBBLPads;
37678 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37679 for (MachineBasicBlock *MBB : InvokeBBs) {
37680 // Remove the landing pad successor from the invoke block and replace it
37681 // with the new dispatch block.
37682 // Keep a copy of Successors since it's modified inside the loop.
37683 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37684 MBB->succ_rend());
37685 // FIXME: Avoid quadratic complexity.
37686 for (auto *MBBS : Successors) {
37687 if (MBBS->isEHPad()) {
37688 MBB->removeSuccessor(MBBS);
37689 MBBLPads.push_back(MBBS);
37690 }
37691 }
37692
37693 MBB->addSuccessor(DispatchBB);
37694
37695 // Find the invoke call and mark all of the callee-saved registers as
37696 // 'implicit defined' so that they're spilled. This prevents code from
37697 // moving instructions to before the EH block, where they will never be
37698 // executed.
37699 for (auto &II : reverse(*MBB)) {
37700 if (!II.isCall())
37701 continue;
37702
37703 DenseSet<Register> DefRegs;
37704 for (auto &MOp : II.operands())
37705 if (MOp.isReg())
37706 DefRegs.insert(MOp.getReg());
37707
37708 MachineInstrBuilder MIB(*MF, &II);
37709 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37710 Register Reg = SavedRegs[RegIdx];
37711         if (!DefRegs.contains(Reg))
37712           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37713 }
37714
37715 break;
37716 }
37717 }
37718
37719 // Mark all former landing pads as non-landing pads. The dispatch is the only
37720 // landing pad now.
37721 for (auto &LP : MBBLPads)
37722 LP->setIsEHPad(false);
37723
37724 // The instruction is gone now.
37725 MI.eraseFromParent();
37726 return BB;
37727}
37728
37730X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37731 MachineBasicBlock *BB) const {
37732 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37733 // calls may require proper stack alignment.
37734 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37735 const MIMetadata MIMD(MI);
37736 MachineFunction &MF = *BB->getParent();
37737
37738 // Emit CALLSEQ_START right before the instruction.
37739 MF.getFrameInfo().setAdjustsStack(true);
37740 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37741 MachineInstrBuilder CallseqStart =
37742 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37743 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37744
37745 // Emit CALLSEQ_END right after the instruction.
37746 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37747 MachineInstrBuilder CallseqEnd =
37748 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37749 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37750
37751 return BB;
37752}
37753
37754MachineBasicBlock *
37755X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37756                                               MachineBasicBlock *BB) const {
37757 MachineFunction *MF = BB->getParent();
37758 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37759 const MIMetadata MIMD(MI);
37760
37761 auto TMMImmToTMMReg = [](unsigned Imm) {
37762 assert (Imm < 8 && "Illegal tmm index");
37763 return X86::TMM0 + Imm;
37764 };
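  // AMX tile pairs (TMM0_TMM1, TMM2_TMM3, ...) are selected by the even tile
  // of the pair, hence the division of the tile index by two below.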
37765 auto TMMImmToTMMPair = [](unsigned Imm) {
37766 assert(Imm < 8 && "Illegal tmm pair index.");
37767 return X86::TMM0_TMM1 + Imm / 2;
37768 };
37769 switch (MI.getOpcode()) {
37770 default:
37771 llvm_unreachable("Unexpected instr type to insert");
37772 case X86::INDIRECT_THUNK_CALL32:
37773 case X86::INDIRECT_THUNK_CALL64:
37774 case X86::INDIRECT_THUNK_TCRETURN32:
37775 case X86::INDIRECT_THUNK_TCRETURN64:
37776 return EmitLoweredIndirectThunk(MI, BB);
37777 case X86::CATCHRET:
37778 return EmitLoweredCatchRet(MI, BB);
37779 case X86::SEG_ALLOCA_32:
37780 case X86::SEG_ALLOCA_64:
37781 return EmitLoweredSegAlloca(MI, BB);
37782 case X86::PROBED_ALLOCA_32:
37783 case X86::PROBED_ALLOCA_64:
37784 return EmitLoweredProbedAlloca(MI, BB);
37785 case X86::TLSCall_32:
37786 case X86::TLSCall_64:
37787 return EmitLoweredTLSCall(MI, BB);
37788 case X86::CMOV_FR16:
37789 case X86::CMOV_FR16X:
37790 case X86::CMOV_FR32:
37791 case X86::CMOV_FR32X:
37792 case X86::CMOV_FR64:
37793 case X86::CMOV_FR64X:
37794 case X86::CMOV_GR8:
37795 case X86::CMOV_GR16:
37796 case X86::CMOV_GR32:
37797 case X86::CMOV_RFP32:
37798 case X86::CMOV_RFP64:
37799 case X86::CMOV_RFP80:
37800 case X86::CMOV_VR64:
37801 case X86::CMOV_VR128:
37802 case X86::CMOV_VR128X:
37803 case X86::CMOV_VR256:
37804 case X86::CMOV_VR256X:
37805 case X86::CMOV_VR512:
37806 case X86::CMOV_VK1:
37807 case X86::CMOV_VK2:
37808 case X86::CMOV_VK4:
37809 case X86::CMOV_VK8:
37810 case X86::CMOV_VK16:
37811 case X86::CMOV_VK32:
37812 case X86::CMOV_VK64:
37813 return EmitLoweredSelect(MI, BB);
37814
37815 case X86::FP80_ADDr:
37816 case X86::FP80_ADDm32: {
37817 // Change the floating point control register to use double extended
37818 // precision when performing the addition.
37819 int OrigCWFrameIdx =
37820 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37821 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37822 OrigCWFrameIdx);
37823
37824 // Load the old value of the control word...
37825 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37826 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37827 OrigCWFrameIdx);
37828
37829 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37830 // precision.
37831 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37832 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37833 .addReg(OldCW, RegState::Kill)
37834 .addImm(0x300);
37835
37836 // Extract to 16 bits.
37837 Register NewCW16 =
37838 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37839 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37840 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37841
37842 // Prepare memory for FLDCW.
37843 int NewCWFrameIdx =
37844 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37845 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37846 NewCWFrameIdx)
37847 .addReg(NewCW16, RegState::Kill);
37848
37849 // Reload the modified control word now...
37850 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37851 NewCWFrameIdx);
37852
37853 // Do the addition.
37854 if (MI.getOpcode() == X86::FP80_ADDr) {
37855 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37856 .add(MI.getOperand(0))
37857 .add(MI.getOperand(1))
37858 .add(MI.getOperand(2));
37859 } else {
37860 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37861 .add(MI.getOperand(0))
37862 .add(MI.getOperand(1))
37863 .add(MI.getOperand(2))
37864 .add(MI.getOperand(3))
37865 .add(MI.getOperand(4))
37866 .add(MI.getOperand(5))
37867 .add(MI.getOperand(6));
37868 }
37869
37870 // Reload the original control word now.
37871 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37872 OrigCWFrameIdx);
37873
37874 MI.eraseFromParent(); // The pseudo instruction is gone now.
37875 return BB;
37876 }
37877
37878 case X86::FP32_TO_INT16_IN_MEM:
37879 case X86::FP32_TO_INT32_IN_MEM:
37880 case X86::FP32_TO_INT64_IN_MEM:
37881 case X86::FP64_TO_INT16_IN_MEM:
37882 case X86::FP64_TO_INT32_IN_MEM:
37883 case X86::FP64_TO_INT64_IN_MEM:
37884 case X86::FP80_TO_INT16_IN_MEM:
37885 case X86::FP80_TO_INT32_IN_MEM:
37886 case X86::FP80_TO_INT64_IN_MEM: {
37887 // Change the floating point control register to use "round towards zero"
37888 // mode when truncating to an integer value.
37889 int OrigCWFrameIdx =
37890 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37891 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37892 OrigCWFrameIdx);
37893
37894 // Load the old value of the control word...
37895 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37896 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37897 OrigCWFrameIdx);
37898
37899 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37900 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37901 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37902 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37903
37904 // Extract to 16 bits.
37905 Register NewCW16 =
37906 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37907 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37908 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37909
37910 // Prepare memory for FLDCW.
37911 int NewCWFrameIdx =
37912 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37913 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37914 NewCWFrameIdx)
37915 .addReg(NewCW16, RegState::Kill);
37916
37917 // Reload the modified control word now...
37918 addFrameReference(BuildMI(*BB, MI, MIMD,
37919 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37920
37921 // Get the X86 opcode to use.
37922 unsigned Opc;
37923 switch (MI.getOpcode()) {
37924 // clang-format off
37925 default: llvm_unreachable("illegal opcode!");
37926 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37927 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37928 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37929 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37930 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37931 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37932 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37933 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37934 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37935 // clang-format on
37936 }
37937
37938     X86AddressMode AM = getAddressFromInstr(&MI, 0);
37939     addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37940 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37941
37942 // Reload the original control word now.
37943 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37944 OrigCWFrameIdx);
37945
37946 MI.eraseFromParent(); // The pseudo instruction is gone now.
37947 return BB;
37948 }
37949
37950 // xbegin
37951 case X86::XBEGIN:
37952 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37953
37954 case X86::VAARG_64:
37955 case X86::VAARG_X32:
37956 return EmitVAARGWithCustomInserter(MI, BB);
37957
37958 case X86::EH_SjLj_SetJmp32:
37959 case X86::EH_SjLj_SetJmp64:
37960 return emitEHSjLjSetJmp(MI, BB);
37961
37962 case X86::EH_SjLj_LongJmp32:
37963 case X86::EH_SjLj_LongJmp64:
37964 return emitEHSjLjLongJmp(MI, BB);
37965
37966 case X86::Int_eh_sjlj_setup_dispatch:
37967 return EmitSjLjDispatchBlock(MI, BB);
37968
37969 case TargetOpcode::STATEPOINT:
37970 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37971 // this point in the process. We diverge later.
37972 return emitPatchPoint(MI, BB);
37973
37974 case TargetOpcode::STACKMAP:
37975 case TargetOpcode::PATCHPOINT:
37976 return emitPatchPoint(MI, BB);
37977
37978 case TargetOpcode::PATCHABLE_EVENT_CALL:
37979 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37980 return emitPatchableEventCall(MI, BB);
37981
37982 case X86::LCMPXCHG8B: {
37983 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37984 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
37985 // requires a memory operand. If it happens that current architecture is
37986 // i686 and for current function we need a base pointer
37987 // - which is ESI for i686 - register allocator would not be able to
37988 // allocate registers for an address in form of X(%reg, %reg, Y)
37989 // - there never would be enough unreserved registers during regalloc
37990   // (without the need for base ptr the only option would be X(%edi, %esi, Y)).
37991 // We are giving a hand to register allocator by precomputing the address in
37992 // a new vreg using LEA.
37993
37994 // If it is not i686 or there is no base pointer - nothing to do here.
37995 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37996 return BB;
37997
37998   // Even though this code does not necessarily need the base pointer to
37999   // be ESI, we check for that. The reason: if this assert fails, some
38000   // changes have happened in the compiler's base pointer handling, which
38001   // most probably have to be addressed somehow here.
38002 assert(TRI->getBaseRegister() == X86::ESI &&
38003 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38004 "base pointer in mind");
38005
38006     MachineRegisterInfo &MRI = MF->getRegInfo();
38007     MVT SPTy = getPointerTy(MF->getDataLayout());
38008 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38009 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38010
38011     X86AddressMode AM = getAddressFromInstr(&MI, 0);
38012     // Regalloc does not need any help when the memory operand of CMPXCHG8B
38013 // does not use index register.
38014 if (AM.IndexReg == X86::NoRegister)
38015 return BB;
38016
38017 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38018 // four operand definitions that are E[ABCD] registers. We skip them and
38019 // then insert the LEA.
38020 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38021 while (RMBBI != BB->rend() &&
38022 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38023 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38024 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38025 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38026 ++RMBBI;
38027 }
38028     MachineBasicBlock::iterator MBBI(RMBBI);
38029     addFullAddress(
38030         BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38031
38032 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38033
38034 return BB;
38035 }
38036 case X86::LCMPXCHG16B_NO_RBX: {
38037 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38038 Register BasePtr = TRI->getBaseRegister();
38039 if (TRI->hasBasePointer(*MF) &&
38040 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38041 if (!BB->isLiveIn(BasePtr))
38042 BB->addLiveIn(BasePtr);
38043 // Save RBX into a virtual register.
38044 Register SaveRBX =
38045 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38046 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38047 .addReg(X86::RBX);
38048 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38049       MachineInstrBuilder MIB =
38050           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38051 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38052 MIB.add(MI.getOperand(Idx));
38053 MIB.add(MI.getOperand(X86::AddrNumOperands));
38054 MIB.addReg(SaveRBX);
38055 } else {
38056 // Simple case, just copy the virtual register to RBX.
38057 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38058 .add(MI.getOperand(X86::AddrNumOperands));
38059       MachineInstrBuilder MIB =
38060           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38061 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38062 MIB.add(MI.getOperand(Idx));
38063 }
38064 MI.eraseFromParent();
38065 return BB;
38066 }
38067 case X86::MWAITX: {
38068 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38069 Register BasePtr = TRI->getBaseRegister();
38070 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38071 // If no need to save the base pointer, we generate MWAITXrrr,
38072 // else we generate pseudo MWAITX_SAVE_RBX.
38073 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38074 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38075 .addReg(MI.getOperand(0).getReg());
38076 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38077 .addReg(MI.getOperand(1).getReg());
38078 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38079 .addReg(MI.getOperand(2).getReg());
38080 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38081 MI.eraseFromParent();
38082 } else {
38083 if (!BB->isLiveIn(BasePtr)) {
38084 BB->addLiveIn(BasePtr);
38085 }
38086 // Parameters can be copied into ECX and EAX but not EBX yet.
38087 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38088 .addReg(MI.getOperand(0).getReg());
38089 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38090 .addReg(MI.getOperand(1).getReg());
38091 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38092 // Save RBX into a virtual register.
38093 Register SaveRBX =
38094 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38095 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38096 .addReg(X86::RBX);
38097 // Generate mwaitx pseudo.
38098 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38099 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38100 .addDef(Dst) // Destination tied in with SaveRBX.
38101 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38102 .addUse(SaveRBX); // Save of base pointer.
38103 MI.eraseFromParent();
38104 }
38105 return BB;
38106 }
38107 case TargetOpcode::PREALLOCATED_SETUP: {
38108 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38109 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38110 MFI->setHasPreallocatedCall(true);
38111 int64_t PreallocatedId = MI.getOperand(0).getImm();
38112 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38113 assert(StackAdjustment != 0 && "0 stack adjustment");
38114 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38115 << StackAdjustment << "\n");
38116 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38117 .addReg(X86::ESP)
38118 .addImm(StackAdjustment);
38119 MI.eraseFromParent();
38120 return BB;
38121 }
38122 case TargetOpcode::PREALLOCATED_ARG: {
38123 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38124 int64_t PreallocatedId = MI.getOperand(1).getImm();
38125 int64_t ArgIdx = MI.getOperand(2).getImm();
38126 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38127 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38128 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38129 << ", arg offset " << ArgOffset << "\n");
38130 // stack pointer + offset
38131 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38132 MI.getOperand(0).getReg()),
38133 X86::ESP, false, ArgOffset);
38134 MI.eraseFromParent();
38135 return BB;
38136 }
38137 case X86::PTDPBSSD:
38138 case X86::PTDPBSUD:
38139 case X86::PTDPBUSD:
38140 case X86::PTDPBUUD:
38141 case X86::PTDPBF16PS:
38142 case X86::PTDPFP16PS:
38143 case X86::PTCMMIMFP16PS:
38144 case X86::PTCMMRLFP16PS:
38145 case X86::PTDPBF8PS:
38146 case X86::PTDPBHF8PS:
38147 case X86::PTDPHBF8PS:
38148 case X86::PTDPHF8PS:
38149 case X86::PTTDPBF16PS:
38150 case X86::PTTDPFP16PS:
38151 case X86::PTTCMMIMFP16PS:
38152 case X86::PTTCMMRLFP16PS:
38153 case X86::PTCONJTCMMIMFP16PS:
38154 case X86::PTMMULTF32PS:
38155 case X86::PTTMMULTF32PS: {
38156 unsigned Opc;
38157 switch (MI.getOpcode()) {
38158 default: llvm_unreachable("illegal opcode!");
38159 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38160 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38161 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38162 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38163 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38164 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38165 case X86::PTCMMIMFP16PS:
38166 Opc = X86::TCMMIMFP16PS;
38167 break;
38168 case X86::PTCMMRLFP16PS:
38169 Opc = X86::TCMMRLFP16PS;
38170 break;
38171 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38172 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38173 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38174 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38175 case X86::PTTDPBF16PS:
38176 Opc = X86::TTDPBF16PS;
38177 break;
38178 case X86::PTTDPFP16PS:
38179 Opc = X86::TTDPFP16PS;
38180 break;
38181 case X86::PTTCMMIMFP16PS:
38182 Opc = X86::TTCMMIMFP16PS;
38183 break;
38184 case X86::PTTCMMRLFP16PS:
38185 Opc = X86::TTCMMRLFP16PS;
38186 break;
38187 case X86::PTCONJTCMMIMFP16PS:
38188 Opc = X86::TCONJTCMMIMFP16PS;
38189 break;
38190 case X86::PTMMULTF32PS:
38191 Opc = X86::TMMULTF32PS;
38192 break;
38193 case X86::PTTMMULTF32PS:
38194 Opc = X86::TTMMULTF32PS;
38195 break;
38196 }
38197
38198 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38199 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38200 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38201 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38202 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38203
38204 MI.eraseFromParent(); // The pseudo is gone now.
38205 return BB;
38206 }
38207 case X86::PTILEZERO: {
38208 unsigned Imm = MI.getOperand(0).getImm();
38209 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38210 MI.eraseFromParent(); // The pseudo is gone now.
38211 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38212     MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38213     return BB;
38214 }
38215 case X86::PTILEZEROV: {
38216 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38217     MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38218     return BB;
38219 }
38220 case X86::PTILELOADDRS:
38221 case X86::PTILELOADDRST1:
38222 case X86::PTILELOADD:
38223 case X86::PTILELOADDT1:
38224 case X86::PTILESTORED: {
38225 unsigned Opc;
38226 switch (MI.getOpcode()) {
38227 default: llvm_unreachable("illegal opcode!");
38228#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38229 case X86::PTILELOADD:
38230 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38231 break;
38232 case X86::PTILELOADDT1:
38233 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38234 break;
38235 case X86::PTILESTORED:
38236 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38237 break;
38238 case X86::PTILELOADDRS:
38239 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38240 break;
38241 case X86::PTILELOADDRST1:
38242 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38243 break;
38244 }
38245#undef GET_EGPR_IF_ENABLED
38246
38247 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38248 unsigned CurOp = 0;
38249 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38250       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38251                  RegState::Define);
38252
38253 MIB.add(MI.getOperand(CurOp++)); // base
38254 MIB.add(MI.getOperand(CurOp++)); // scale
38255 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38256 MIB.add(MI.getOperand(CurOp++)); // displacement
38257 MIB.add(MI.getOperand(CurOp++)); // segment
38258
38259 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38260       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38261                  RegState::Undef);
38262
38263 MI.eraseFromParent(); // The pseudo is gone now.
38264 return BB;
38265 }
38266 case X86::PT2RPNTLVWZ0:
38267 case X86::PT2RPNTLVWZ0T1:
38268 case X86::PT2RPNTLVWZ1:
38269 case X86::PT2RPNTLVWZ1T1:
38270 case X86::PT2RPNTLVWZ0RS:
38271 case X86::PT2RPNTLVWZ0RST1:
38272 case X86::PT2RPNTLVWZ1RS:
38273 case X86::PT2RPNTLVWZ1RST1: {
38274 const DebugLoc &DL = MI.getDebugLoc();
38275 unsigned Opc;
38276#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38277 switch (MI.getOpcode()) {
38278 default:
38279 llvm_unreachable("Unexpected instruction!");
38280 case X86::PT2RPNTLVWZ0:
38281 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38282 break;
38283 case X86::PT2RPNTLVWZ0T1:
38284 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38285 break;
38286 case X86::PT2RPNTLVWZ1:
38287 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38288 break;
38289 case X86::PT2RPNTLVWZ1T1:
38290 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38291 break;
38292 case X86::PT2RPNTLVWZ0RS:
38293 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38294 break;
38295 case X86::PT2RPNTLVWZ0RST1:
38296 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38297 break;
38298 case X86::PT2RPNTLVWZ1RS:
38299 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38300 break;
38301 case X86::PT2RPNTLVWZ1RST1:
38302 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38303 break;
38304 }
38305#undef GET_EGPR_IF_ENABLED
38306 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38307 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38308
38309 MIB.add(MI.getOperand(1)); // base
38310 MIB.add(MI.getOperand(2)); // scale
38311 MIB.add(MI.getOperand(3)); // index
38312 MIB.add(MI.getOperand(4)); // displacement
38313 MIB.add(MI.getOperand(5)); // segment
38314 MI.eraseFromParent(); // The pseudo is gone now.
38315 return BB;
38316 }
38317 case X86::PTTRANSPOSED:
38318 case X86::PTCONJTFP16: {
38319 const DebugLoc &DL = MI.getDebugLoc();
38320 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38321 : X86::TCONJTFP16;
38322
38323 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38324 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38325 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38326
38327 MI.eraseFromParent(); // The pseudo is gone now.
38328 return BB;
38329 }
38330 case X86::PTCVTROWPS2BF16Hrri:
38331 case X86::PTCVTROWPS2BF16Lrri:
38332 case X86::PTCVTROWPS2PHHrri:
38333 case X86::PTCVTROWPS2PHLrri:
38334 case X86::PTCVTROWD2PSrri:
38335 case X86::PTILEMOVROWrri: {
38336 const DebugLoc &DL = MI.getDebugLoc();
38337 unsigned Opc;
38338 switch (MI.getOpcode()) {
38339 default:
38340 llvm_unreachable("Unexpected instruction!");
38341 case X86::PTCVTROWD2PSrri:
38342 Opc = X86::TCVTROWD2PSrri;
38343 break;
38344 case X86::PTCVTROWPS2BF16Hrri:
38345 Opc = X86::TCVTROWPS2BF16Hrri;
38346 break;
38347 case X86::PTCVTROWPS2PHHrri:
38348 Opc = X86::TCVTROWPS2PHHrri;
38349 break;
38350 case X86::PTCVTROWPS2BF16Lrri:
38351 Opc = X86::TCVTROWPS2BF16Lrri;
38352 break;
38353 case X86::PTCVTROWPS2PHLrri:
38354 Opc = X86::TCVTROWPS2PHLrri;
38355 break;
38356 case X86::PTILEMOVROWrri:
38357 Opc = X86::TILEMOVROWrri;
38358 break;
38359 }
38360 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38361 MIB.add(MI.getOperand(0));
38362 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38363 MIB.addImm(MI.getOperand(2).getImm());
38364
38365 MI.eraseFromParent(); // The pseudo is gone now.
38366 return BB;
38367 }
38368 case X86::PTCVTROWPS2BF16Hrre:
38369 case X86::PTCVTROWPS2BF16Lrre:
38370 case X86::PTCVTROWPS2PHHrre:
38371 case X86::PTCVTROWPS2PHLrre:
38372 case X86::PTCVTROWD2PSrre:
38373 case X86::PTILEMOVROWrre: {
38374 const DebugLoc &DL = MI.getDebugLoc();
38375 unsigned Opc;
38376 switch (MI.getOpcode()) {
38377 default:
38378 llvm_unreachable("Unexpected instruction!");
38379 case X86::PTCVTROWD2PSrre:
38380 Opc = X86::TCVTROWD2PSrre;
38381 break;
38382 case X86::PTCVTROWPS2BF16Hrre:
38383 Opc = X86::TCVTROWPS2BF16Hrre;
38384 break;
38385 case X86::PTCVTROWPS2BF16Lrre:
38386 Opc = X86::TCVTROWPS2BF16Lrre;
38387 break;
38388 case X86::PTCVTROWPS2PHHrre:
38389 Opc = X86::TCVTROWPS2PHHrre;
38390 break;
38391 case X86::PTCVTROWPS2PHLrre:
38392 Opc = X86::TCVTROWPS2PHLrre;
38393 break;
38394 case X86::PTILEMOVROWrre:
38395 Opc = X86::TILEMOVROWrre;
38396 break;
38397 }
38398 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38399 MIB.add(MI.getOperand(0));
38400 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38401 MIB.add(MI.getOperand(2));
38402
38403 MI.eraseFromParent(); // The pseudo is gone now.
38404 return BB;
38405 }
38406 }
38407}
38408
38409//===----------------------------------------------------------------------===//
38410// X86 Optimization Hooks
38411//===----------------------------------------------------------------------===//
38412
38413bool
38414X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38415 const APInt &DemandedBits,
38416 const APInt &DemandedElts,
38417 TargetLoweringOpt &TLO) const {
38418 EVT VT = Op.getValueType();
38419 unsigned Opcode = Op.getOpcode();
38420 unsigned EltSize = VT.getScalarSizeInBits();
38421
38422 if (VT.isVector()) {
38423     // If the constant is only all signbits in the active bits, then we should
38424     // extend it to the entire constant to allow it to act as a boolean constant
38425     // vector.
38425 // vector.
38426 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38427 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38428 return false;
38429 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38430 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38431 continue;
38432 const APInt &Val = V.getConstantOperandAPInt(i);
38433 if (Val.getBitWidth() > Val.getNumSignBits() &&
38434 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38435 return true;
38436 }
38437 return false;
38438 };
38439 // For vectors - if we have a constant, then try to sign extend.
38440 // TODO: Handle AND cases.
38441 unsigned ActiveBits = DemandedBits.getActiveBits();
38442 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38443 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38444 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38445 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38446       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38447                                    VT.getVectorNumElements());
38448       SDValue NewC =
38449           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38450                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38451 SDValue NewOp =
38452 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38453 return TLO.CombineTo(Op, NewOp);
38454 }
38455 return false;
38456 }
38457
38458 // Only optimize Ands to prevent shrinking a constant that could be
38459 // matched by movzx.
38460 if (Opcode != ISD::AND)
38461 return false;
38462
38463 // Make sure the RHS really is a constant.
38464 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38465 if (!C)
38466 return false;
38467
38468 const APInt &Mask = C->getAPIntValue();
38469
38470 // Clear all non-demanded bits initially.
38471 APInt ShrunkMask = Mask & DemandedBits;
38472
38473 // Find the width of the shrunk mask.
38474 unsigned Width = ShrunkMask.getActiveBits();
38475
38476 // If the mask is all 0s there's nothing to do here.
38477 if (Width == 0)
38478 return false;
38479
38480 // Find the next power of 2 width, rounding up to a byte.
38481 Width = llvm::bit_ceil(std::max(Width, 8U));
38482 // Truncate the width to size to handle illegal types.
38483 Width = std::min(Width, EltSize);
38484
38485 // Calculate a possible zero extend mask for this constant.
38486 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38487
38488 // If we aren't changing the mask, just return true to keep it and prevent
38489 // the caller from optimizing.
38490 if (ZeroExtendMask == Mask)
38491 return true;
38492
38493 // Make sure the new mask can be represented by a combination of mask bits
38494 // and non-demanded bits.
38495 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38496 return false;
38497
38498 // Replace the constant with the zero extend mask.
38499 SDLoc DL(Op);
38500 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38501 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38502 return TLO.CombineTo(Op, NewOp);
38503}
38504
38505static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38506                                      KnownBits &Known,
38507 const APInt &DemandedElts,
38508 const SelectionDAG &DAG, unsigned Depth) {
38509 KnownBits Known2;
38510 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38511 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38512 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38513 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38514 Known = KnownBits::abdu(Known, Known2).zext(16);
38515 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38516 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38517 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38518 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38519 Known = Known.zext(64);
38520}
38521
38522static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38523                                       KnownBits &Known,
38524 const APInt &DemandedElts,
38525 const SelectionDAG &DAG,
38526 unsigned Depth) {
38527 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38528
38529 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38530 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38531 APInt DemandedLoElts =
38532 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38533 APInt DemandedHiElts =
38534 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38535 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38536 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38537 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38538 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38539 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38540 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38541 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38542}
38543
38544static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38545                                         KnownBits &Known,
38546 const APInt &DemandedElts,
38547 const SelectionDAG &DAG,
38548 unsigned Depth) {
38549 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38550
38551 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38552 // pairs.
38553 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38554 APInt DemandedLoElts =
38555 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38556 APInt DemandedHiElts =
38557 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38558 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38559 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38560 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38561 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38562 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38563 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38564 Known = KnownBits::sadd_sat(Lo, Hi);
38565}
38566
38567static KnownBits computeKnownBitsForHorizontalOperation(
38568    const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38569 const SelectionDAG &DAG,
38570 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38571 KnownBitsFunc) {
38572 APInt DemandedEltsLHS, DemandedEltsRHS;
38573 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38574 DemandedElts, DemandedEltsLHS,
38575 DemandedEltsRHS);
38576
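  // Each horizontal-op result element combines a pair of adjacent source
  // elements, so the known bits of every demanded lane are folded with those
  // of its neighbouring lane (DemandedEltsOp << 1).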
38577 const auto ComputeForSingleOpFunc =
38578 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38579 return KnownBitsFunc(
38580 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38581 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38582 };
38583
38584 if (DemandedEltsRHS.isZero())
38585 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38586 if (DemandedEltsLHS.isZero())
38587 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38588
38589 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38590 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38591}
38592
38593void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38594                                                      KnownBits &Known,
38595 const APInt &DemandedElts,
38596 const SelectionDAG &DAG,
38597 unsigned Depth) const {
38598 unsigned BitWidth = Known.getBitWidth();
38599 unsigned NumElts = DemandedElts.getBitWidth();
38600 unsigned Opc = Op.getOpcode();
38601 EVT VT = Op.getValueType();
38606 "Should use MaskedValueIsZero if you don't know whether Op"
38607 " is a target node!");
38608
38609 Known.resetAll();
38610 switch (Opc) {
38611 default: break;
38612 case X86ISD::MUL_IMM: {
38613 KnownBits Known2;
38614 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38615 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38616 Known = KnownBits::mul(Known, Known2);
38617 break;
38618 }
38619  case X86ISD::BSF: {
38620    Known.Zero.setBitsFrom(Log2_32(BitWidth) + 1);
38621
38622 KnownBits Known2;
38623 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38624 if (Known2.isNonZero()) {
38625 // If we have a known 1, its position is our upper bound.
38626 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38627 unsigned LowBits = llvm::bit_width(PossibleTZ);
38628 Known.Zero.setBitsFrom(LowBits);
38629 } else if (!Op.getOperand(0).isUndef()) {
38630 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38631 Known = Known.intersectWith(Known2);
38632 }
38633 break;
38634 }
38635 case X86ISD::BSR: {
38636    // TODO: Bound with input known bits?
38637    Known.Zero.setBitsFrom(Log2_32(BitWidth) + 1);
38638
38639 if (!Op.getOperand(0).isUndef() &&
38640 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38641 KnownBits Known2;
38642 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38643 Known = Known.intersectWith(Known2);
38644 }
38645 break;
38646 }
38647 case X86ISD::SETCC:
38648 Known.Zero.setBitsFrom(1);
38649 break;
38650 case X86ISD::MOVMSK: {
38651 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38652 Known.Zero.setBitsFrom(NumLoBits);
38653 break;
38654 }
38655 case X86ISD::PEXTRB:
38656 case X86ISD::PEXTRW: {
38657 SDValue Src = Op.getOperand(0);
38658 EVT SrcVT = Src.getValueType();
38659 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38660 Op.getConstantOperandVal(1));
38661 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38662 Known = Known.anyextOrTrunc(BitWidth);
38663 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38664 break;
38665 }
38666 case X86ISD::VSRAI:
38667 case X86ISD::VSHLI:
38668 case X86ISD::VSRLI: {
38669 unsigned ShAmt = Op.getConstantOperandVal(1);
38670 if (ShAmt >= VT.getScalarSizeInBits()) {
38671 // Out of range logical bit shifts are guaranteed to be zero.
38672 // Out of range arithmetic bit shifts splat the sign bit.
38673 if (Opc != X86ISD::VSRAI) {
38674 Known.setAllZero();
38675 break;
38676 }
38677
38678 ShAmt = VT.getScalarSizeInBits() - 1;
38679 }
38680
38681 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38682 if (Opc == X86ISD::VSHLI) {
38683 Known <<= ShAmt;
38684 // Low bits are known zero.
38685 Known.Zero.setLowBits(ShAmt);
38686 } else if (Opc == X86ISD::VSRLI) {
38687 Known >>= ShAmt;
38688 // High bits are known zero.
38689 Known.Zero.setHighBits(ShAmt);
38690 } else {
38691 Known.Zero.ashrInPlace(ShAmt);
38692 Known.One.ashrInPlace(ShAmt);
38693 }
38694 break;
38695 }
38696 case X86ISD::PACKUS: {
38697 // PACKUS is just a truncation if the upper half is zero.
38698 APInt DemandedLHS, DemandedRHS;
38699 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38700
38701 Known.One = APInt::getAllOnes(BitWidth * 2);
38702 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38703
38704 KnownBits Known2;
38705 if (!!DemandedLHS) {
38706 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38707 Known = Known.intersectWith(Known2);
38708 }
38709 if (!!DemandedRHS) {
38710 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38711 Known = Known.intersectWith(Known2);
38712 }
38713
38714 if (Known.countMinLeadingZeros() < BitWidth)
38715 Known.resetAll();
38716 Known = Known.trunc(BitWidth);
38717 break;
38718 }
38719 case X86ISD::PSHUFB: {
38720 SDValue Src = Op.getOperand(0);
38721 SDValue Idx = Op.getOperand(1);
38722
38723 // If the index vector is never negative (MSB is zero), then all elements
38724 // come from the source vector. This is useful for cases where
38725 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38726 // below will handle the more common constant shuffle mask case.
38727 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38728 if (KnownIdx.isNonNegative())
38729 Known = DAG.computeKnownBits(Src, Depth + 1);
38730 break;
38731 }
38732 case X86ISD::VBROADCAST: {
38733 SDValue Src = Op.getOperand(0);
38734 if (!Src.getSimpleValueType().isVector()) {
38735 Known = DAG.computeKnownBits(Src, Depth + 1);
38736 return;
38737 }
38738 break;
38739 }
38740 case X86ISD::AND: {
38741 if (Op.getResNo() == 0) {
38742 KnownBits Known2;
38743 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38744 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38745 Known &= Known2;
38746 }
38747 break;
38748 }
38749 case X86ISD::ANDNP: {
38750 KnownBits Known2;
38751 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38752 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38753
38754 // ANDNP = (~X & Y);
38755 Known.One &= Known2.Zero;
38756 Known.Zero |= Known2.One;
38757 break;
38758 }
38759 case X86ISD::FOR: {
38760 KnownBits Known2;
38761 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38762 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38763
38764 Known |= Known2;
38765 break;
38766 }
38767 case X86ISD::PSADBW: {
38768 SDValue LHS = Op.getOperand(0);
38769 SDValue RHS = Op.getOperand(1);
38770 assert(VT.getScalarType() == MVT::i64 &&
38771 LHS.getValueType() == RHS.getValueType() &&
38772 LHS.getValueType().getScalarType() == MVT::i8 &&
38773 "Unexpected PSADBW types");
38774 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38775 break;
38776 }
38777 case X86ISD::PCMPGT:
38778 case X86ISD::PCMPEQ: {
38779 KnownBits KnownLhs =
38780 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38781 KnownBits KnownRhs =
38782 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38783 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38784 ? KnownBits::eq(KnownLhs, KnownRhs)
38785 : KnownBits::sgt(KnownLhs, KnownRhs);
38786 if (Res) {
38787 if (*Res)
38788 Known.setAllOnes();
38789 else
38790 Known.setAllZero();
38791 }
38792 break;
38793 }
38794 case X86ISD::VPMADDWD: {
38795 SDValue LHS = Op.getOperand(0);
38796 SDValue RHS = Op.getOperand(1);
38797 assert(VT.getVectorElementType() == MVT::i32 &&
38798 LHS.getValueType() == RHS.getValueType() &&
38799 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38800 "Unexpected PMADDWD types");
38801 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38802 break;
38803 }
38804 case X86ISD::VPMADDUBSW: {
38805 SDValue LHS = Op.getOperand(0);
38806 SDValue RHS = Op.getOperand(1);
38807 assert(VT.getVectorElementType() == MVT::i16 &&
38808 LHS.getValueType() == RHS.getValueType() &&
38809 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38810 "Unexpected PMADDUBSW types");
38811 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38812 break;
38813 }
38814 case X86ISD::PMULUDQ: {
38815 KnownBits Known2;
38816 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38817 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38818
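    // PMULUDQ multiplies the low 32 bits of each 64-bit element, zero-extended
    // back to the full 64-bit width, so model the operands the same way.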
38819 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38820 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38821 Known = KnownBits::mul(Known, Known2);
38822 break;
38823 }
38824 case X86ISD::CMOV: {
38825 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38826 // If we don't know any bits, early out.
38827 if (Known.isUnknown())
38828 break;
38829 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38830
38831 // Only known if known in both the LHS and RHS.
38832 Known = Known.intersectWith(Known2);
38833 break;
38834 }
38835 case X86ISD::BEXTR:
38836 case X86ISD::BEXTRI: {
38837 SDValue Op0 = Op.getOperand(0);
38838 SDValue Op1 = Op.getOperand(1);
38839
38840 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38841 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38842 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38843
38844 // If the length is 0, the result is 0.
38845 if (Length == 0) {
38846 Known.setAllZero();
38847 break;
38848 }
38849
38850 if ((Shift + Length) <= BitWidth) {
38851 Known = DAG.computeKnownBits(Op0, Depth + 1);
38852 Known = Known.extractBits(Length, Shift);
38853 Known = Known.zextOrTrunc(BitWidth);
38854 }
38855 }
38856 break;
38857 }
38858 case X86ISD::PDEP: {
38859 KnownBits Known2;
38860 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38861 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38862 // Zeros are retained from the mask operand. But not ones.
38863 Known.One.clearAllBits();
38864 // The result will have at least as many trailing zeros as the non-mask
38865 // operand since bits can only map to the same or higher bit position.
38866 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38867 break;
38868 }
38869 case X86ISD::PEXT: {
38870 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38871 // The result has as many leading zeros as the number of zeroes in the mask.
38872 unsigned Count = Known.Zero.popcount();
38874 Known.One.clearAllBits();
38875 break;
38876 }
38877 case X86ISD::VTRUNC:
38878 case X86ISD::VTRUNCS:
38879 case X86ISD::VTRUNCUS:
38880 case X86ISD::CVTSI2P:
38881 case X86ISD::CVTUI2P:
38882 case X86ISD::CVTP2SI:
38883 case X86ISD::CVTP2UI:
38884 case X86ISD::MCVTP2SI:
38885 case X86ISD::MCVTP2UI:
38886 case X86ISD::CVTTP2SI:
38887 case X86ISD::CVTTP2UI:
38888 case X86ISD::MCVTTP2SI:
38889 case X86ISD::MCVTTP2UI:
38890 case X86ISD::MCVTSI2P:
38891 case X86ISD::MCVTUI2P:
38892 case X86ISD::VFPROUND:
38893 case X86ISD::VMFPROUND:
38894 case X86ISD::CVTPS2PH:
38895 case X86ISD::MCVTPS2PH:
38896 case X86ISD::MCVTTP2SIS:
38897 case X86ISD::MCVTTP2UIS: {
38898 // Truncations/Conversions - upper elements are known zero.
38899 EVT SrcVT = Op.getOperand(0).getValueType();
38900 if (SrcVT.isVector()) {
38901 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38902 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38903 Known.setAllZero();
38904 }
38905 break;
38906 }
38913 // Strict Conversions - upper elements are known zero.
38914 EVT SrcVT = Op.getOperand(1).getValueType();
38915 if (SrcVT.isVector()) {
38916 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38917 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38918 Known.setAllZero();
38919 }
38920 break;
38921 }
38922 case X86ISD::MOVQ2DQ: {
38923 // Move from MMX to XMM. Upper half of XMM should be 0.
38924 if (DemandedElts.countr_zero() >= (NumElts / 2))
38925 Known.setAllZero();
38926 break;
38927 }
38929 APInt UndefElts;
38930 SmallVector<APInt, 16> EltBits;
38931 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38932 /*AllowWholeUndefs*/ false,
38933 /*AllowPartialUndefs*/ false)) {
38934 Known.Zero.setAllBits();
38935 Known.One.setAllBits();
38936 for (unsigned I = 0; I != NumElts; ++I) {
38937 if (!DemandedElts[I])
38938 continue;
38939 if (UndefElts[I]) {
38940 Known.resetAll();
38941 break;
38942 }
38943 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38944 Known = Known.intersectWith(Known2);
38945 }
38946 return;
38947 }
38948 break;
38949 }
38950 case X86ISD::HADD:
38951 case X86ISD::HSUB: {
38952    Known = computeKnownBitsForHorizontalOperation(
38953        Op, DemandedElts, Depth, DAG,
38954 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38955          return KnownBits::computeForAddSub(
38956              /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38957 KnownLHS, KnownRHS);
38958 });
38959 break;
38960 }
38961  case ISD::INTRINSIC_WO_CHAIN: {
38962    switch (Op->getConstantOperandVal(0)) {
38963 case Intrinsic::x86_sse2_pmadd_wd:
38964 case Intrinsic::x86_avx2_pmadd_wd:
38965 case Intrinsic::x86_avx512_pmaddw_d_512: {
38966 SDValue LHS = Op.getOperand(1);
38967 SDValue RHS = Op.getOperand(2);
38968 assert(VT.getScalarType() == MVT::i32 &&
38969 LHS.getValueType() == RHS.getValueType() &&
38970 LHS.getValueType().getScalarType() == MVT::i16 &&
38971 "Unexpected PMADDWD types");
38972 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38973 break;
38974 }
38975 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38976 case Intrinsic::x86_avx2_pmadd_ub_sw:
38977 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38978 SDValue LHS = Op.getOperand(1);
38979 SDValue RHS = Op.getOperand(2);
38980 assert(VT.getScalarType() == MVT::i16 &&
38981 LHS.getValueType() == RHS.getValueType() &&
38982 LHS.getValueType().getScalarType() == MVT::i8 &&
38983 "Unexpected PMADDUBSW types");
38984 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38985 break;
38986 }
38987 case Intrinsic::x86_sse2_psad_bw:
38988 case Intrinsic::x86_avx2_psad_bw:
38989 case Intrinsic::x86_avx512_psad_bw_512: {
38990 SDValue LHS = Op.getOperand(1);
38991 SDValue RHS = Op.getOperand(2);
38992 assert(VT.getScalarType() == MVT::i64 &&
38993 LHS.getValueType() == RHS.getValueType() &&
38994 LHS.getValueType().getScalarType() == MVT::i8 &&
38995 "Unexpected PSADBW types");
38996 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38997 break;
38998 }
38999 }
39000 break;
39001 }
39002 }
39003
39004 // Handle target shuffles.
39005 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39006 if (isTargetShuffle(Opc)) {
39007 SmallVector<int, 64> Mask;
39008 SmallVector<SDValue, 2> Ops;
39009 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39010 unsigned NumOps = Ops.size();
39011 unsigned NumElts = VT.getVectorNumElements();
39012 if (Mask.size() == NumElts) {
39013 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39014 Known.Zero.setAllBits(); Known.One.setAllBits();
39015 for (unsigned i = 0; i != NumElts; ++i) {
39016 if (!DemandedElts[i])
39017 continue;
39018 int M = Mask[i];
39019 if (M == SM_SentinelUndef) {
39020 // For UNDEF elements, we don't know anything about the common state
39021 // of the shuffle result.
39022 Known.resetAll();
39023 break;
39024 }
39025 if (M == SM_SentinelZero) {
39026 Known.One.clearAllBits();
39027 continue;
39028 }
39029 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39030 "Shuffle index out of range");
39031
39032 unsigned OpIdx = (unsigned)M / NumElts;
39033 unsigned EltIdx = (unsigned)M % NumElts;
39034 if (Ops[OpIdx].getValueType() != VT) {
39035 // TODO - handle target shuffle ops with different value types.
39036 Known.resetAll();
39037 break;
39038 }
39039 DemandedOps[OpIdx].setBit(EltIdx);
39040 }
39041 // Known bits are the values that are shared by every demanded element.
39042 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39043 if (!DemandedOps[i])
39044 continue;
39045 KnownBits Known2 =
39046 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39047 Known = Known.intersectWith(Known2);
39048 }
39049 }
39050 }
39051 }
39052}
39053
39054 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39055 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39056 unsigned Depth) const {
39057 EVT VT = Op.getValueType();
39058 unsigned VTBits = VT.getScalarSizeInBits();
39059 unsigned Opcode = Op.getOpcode();
39060 switch (Opcode) {
39061 case X86ISD::SETCC_CARRY:
39062 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39063 return VTBits;
39064
39065 case X86ISD::VTRUNC: {
39066 SDValue Src = Op.getOperand(0);
39067 MVT SrcVT = Src.getSimpleValueType();
39068 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39069 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39070 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39071 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39072 if (Tmp > (NumSrcBits - VTBits))
39073 return Tmp - (NumSrcBits - VTBits);
39074 return 1;
39075 }
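// For illustration: truncating i64 elements to i32 with 40 incoming sign bits
// leaves 40 - (64 - 32) = 8 sign bits per element.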
39076
39077 case X86ISD::PACKSS: {
39078 // PACKSS is just a truncation if the sign bits extend to the packed size.
39079 APInt DemandedLHS, DemandedRHS;
39080 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39081 DemandedRHS);
39082
39083 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39084 // patterns often used to compact vXi64 allsignbit patterns.
39085 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39086 SDValue BC = peekThroughBitcasts(V);
39087 if (BC.getOpcode() == X86ISD::PACKSS &&
39088 BC.getScalarValueSizeInBits() == 16 &&
39089 V.getScalarValueSizeInBits() == 32) {
39090 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39091 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39092 if (BC0.getScalarValueSizeInBits() == 64 &&
39093 BC1.getScalarValueSizeInBits() == 64 &&
39094 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39095 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39096 return 32;
39097 }
39098 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39099 };
39100
39101 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39102 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39103 if (!!DemandedLHS)
39104 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39105 if (!!DemandedRHS)
39106 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39107 unsigned Tmp = std::min(Tmp0, Tmp1);
39108 if (Tmp > (SrcBits - VTBits))
39109 return Tmp - (SrcBits - VTBits);
39110 return 1;
39111 }
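// For illustration: PACKSSDW (i32 -> i16) with at least 20 sign bits in both
// inputs leaves 20 - 16 = 4 sign bits in each packed i16 element.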
39112
39113 case X86ISD::VBROADCAST: {
39114 SDValue Src = Op.getOperand(0);
39115 if (!Src.getSimpleValueType().isVector())
39116 return DAG.ComputeNumSignBits(Src, Depth + 1);
39117 break;
39118 }
39119
39120 case X86ISD::VSHLI: {
39121 SDValue Src = Op.getOperand(0);
39122 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39123 if (ShiftVal.uge(VTBits))
39124 return VTBits; // Shifted all bits out --> zero.
39125 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39126 if (ShiftVal.uge(Tmp))
39127 return 1; // Shifted all sign bits out --> unknown.
39128 return Tmp - ShiftVal.getZExtValue();
39129 }
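// For illustration: VSHLI by 2 on elements with 10 known sign bits leaves
// 10 - 2 = 8 sign bits; shifting out every bit gives a known-zero result.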
39130
39131 case X86ISD::VSRAI: {
39132 SDValue Src = Op.getOperand(0);
39133 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39134 if (ShiftVal.uge(VTBits - 1))
39135 return VTBits; // Sign splat.
39136 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39137 ShiftVal += Tmp;
39138 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39139 }
39140
39141 case X86ISD::FSETCC:
39142 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39143 if (VT == MVT::f32 || VT == MVT::f64 ||
39144 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39145 return VTBits;
39146 break;
39147
39148 case X86ISD::PCMPGT:
39149 case X86ISD::PCMPEQ:
39150 case X86ISD::CMPP:
39151 case X86ISD::VPCOM:
39152 case X86ISD::VPCOMU:
39153 // Vector compares return zero/all-bits result values.
39154 return VTBits;
39155
39156 case X86ISD::ANDNP: {
39157 unsigned Tmp0 =
39158 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39159 if (Tmp0 == 1) return 1; // Early out.
39160 unsigned Tmp1 =
39161 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39162 return std::min(Tmp0, Tmp1);
39163 }
39164
39165 case X86ISD::CMOV: {
39166 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39167 if (Tmp0 == 1) return 1; // Early out.
39168 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39169 return std::min(Tmp0, Tmp1);
39170 }
39171 }
39172
39173 // Handle target shuffles.
39174 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39175 if (isTargetShuffle(Opcode)) {
39176 SmallVector<int, 64> Mask;
39177 SmallVector<SDValue, 2> Ops;
39178 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39179 unsigned NumOps = Ops.size();
39180 unsigned NumElts = VT.getVectorNumElements();
39181 if (Mask.size() == NumElts) {
39182 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39183 for (unsigned i = 0; i != NumElts; ++i) {
39184 if (!DemandedElts[i])
39185 continue;
39186 int M = Mask[i];
39187 if (M == SM_SentinelUndef) {
39188 // For UNDEF elements, we don't know anything about the common state
39189 // of the shuffle result.
39190 return 1;
39191 } else if (M == SM_SentinelZero) {
39192 // Zero = all sign bits.
39193 continue;
39194 }
39195 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39196 "Shuffle index out of range");
39197
39198 unsigned OpIdx = (unsigned)M / NumElts;
39199 unsigned EltIdx = (unsigned)M % NumElts;
39200 if (Ops[OpIdx].getValueType() != VT) {
39201 // TODO - handle target shuffle ops with different value types.
39202 return 1;
39203 }
39204 DemandedOps[OpIdx].setBit(EltIdx);
39205 }
39206 unsigned Tmp0 = VTBits;
39207 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39208 if (!DemandedOps[i])
39209 continue;
39210 unsigned Tmp1 =
39211 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39212 Tmp0 = std::min(Tmp0, Tmp1);
39213 }
39214 return Tmp0;
39215 }
39216 }
39217 }
39218
39219 // Fallback case.
39220 return 1;
39221}
39222
39223 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39224 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39225 return N->getOperand(0);
39226 return N;
39227}
39228
39229// Helper to look for a normal load that can be narrowed into a vzload with the
39230// specified VT and memory VT. Returns SDValue() on failure.
39231 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39232 SelectionDAG &DAG) {
39233 // Can't if the load is volatile or atomic.
39234 if (!LN->isSimple())
39235 return SDValue();
39236
39237 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39238 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39239 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39240 LN->getPointerInfo(), LN->getBaseAlign(),
39241 LN->getMemOperand()->getFlags());
39242}
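// For illustration: a simple v2i64 load whose upper element is never used can
// be rebuilt as a VZEXT_LOAD with MemVT = i64, reusing the original chain,
// pointer info and alignment.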
39243
39244// Attempt to match a combined shuffle mask against supported unary shuffle
39245// instructions.
39246// TODO: Investigate sharing more of this with shuffle lowering.
39247static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39248 bool AllowFloatDomain, bool AllowIntDomain,
39249 SDValue V1, const SelectionDAG &DAG,
39250 const X86Subtarget &Subtarget, unsigned &Shuffle,
39251 MVT &SrcVT, MVT &DstVT) {
39252 unsigned NumMaskElts = Mask.size();
39253 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39254
39255 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39256 if (Mask[0] == 0 &&
39257 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39258 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39259 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39260 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39261 Shuffle = X86ISD::VZEXT_MOVL;
39262 if (MaskEltSize == 16)
39263 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39264 else
39265 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39266 return true;
39267 }
39268 }
39269
39270 // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39271 if (AllowIntDomain &&
39272 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39273 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39274 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39275 unsigned MaxScale = 64 / MaskEltSize;
39276 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39277 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39278 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39279 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39280 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39281 continue;
39282 bool MatchAny = true;
39283 bool MatchZero = true;
39284 bool MatchSign = UseSign;
39285 unsigned NumDstElts = NumMaskElts / Scale;
39286 for (unsigned i = 0;
39287 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39288 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39289 MatchAny = MatchSign = MatchZero = false;
39290 break;
39291 }
39292 unsigned Pos = (i * Scale) + 1;
39293 unsigned Len = Scale - 1;
39294 MatchAny &= isUndefInRange(Mask, Pos, Len);
39295 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39296 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39297 }
39298 if (MatchAny || MatchSign || MatchZero) {
39299 assert((MatchSign || MatchZero) &&
39300 "Failed to match sext/zext but matched aext?");
39301 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39302 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39303 : MVT::getIntegerVT(MaskEltSize);
39304 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39305
39306 Shuffle = unsigned(
39307 MatchAny ? ISD::ANY_EXTEND
39308 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39309 if (SrcVT.getVectorNumElements() != NumDstElts)
39310 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39311
39312 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39313 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39314 return true;
39315 }
39316 }
39317 }
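// For illustration: the v4i32 mask {0, Z, 1, Z} (Z = zero) matches Scale == 2
// with MatchZero, i.e. a v4i32 -> v2i64 ZERO_EXTEND_VECTOR_INREG.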
39318
39319 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39320 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39321 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39322 isUndefOrEqual(Mask[0], 0) &&
39323 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39324 Shuffle = X86ISD::VZEXT_MOVL;
39325 if (MaskEltSize == 16)
39326 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39327 else
39328 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39329 return true;
39330 }
39331
39332 // Check if we have SSE3, which will let us use MOVDDUP etc. These
39333 // instructions are no slower than UNPCKLPD but have the option to
39334 // fold the input operand into even an unaligned memory load.
39335 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39336 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39337 Shuffle = X86ISD::MOVDDUP;
39338 SrcVT = DstVT = MVT::v2f64;
39339 return true;
39340 }
39341 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39342 Shuffle = X86ISD::MOVSLDUP;
39343 SrcVT = DstVT = MVT::v4f32;
39344 return true;
39345 }
39346 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39347 Shuffle = X86ISD::MOVSHDUP;
39348 SrcVT = DstVT = MVT::v4f32;
39349 return true;
39350 }
39351 }
39352
39353 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39354 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39355 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39356 Shuffle = X86ISD::MOVDDUP;
39357 SrcVT = DstVT = MVT::v4f64;
39358 return true;
39359 }
39360 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39361 V1)) {
39362 Shuffle = X86ISD::MOVSLDUP;
39363 SrcVT = DstVT = MVT::v8f32;
39364 return true;
39365 }
39366 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39367 V1)) {
39368 Shuffle = X86ISD::MOVSHDUP;
39369 SrcVT = DstVT = MVT::v8f32;
39370 return true;
39371 }
39372 }
39373
39374 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39375 assert(Subtarget.hasAVX512() &&
39376 "AVX512 required for 512-bit vector shuffles");
39377 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39378 V1)) {
39379 Shuffle = X86ISD::MOVDDUP;
39380 SrcVT = DstVT = MVT::v8f64;
39381 return true;
39382 }
39383 if (isTargetShuffleEquivalent(
39384 MaskVT, Mask,
39385 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39386 Shuffle = X86ISD::MOVSLDUP;
39387 SrcVT = DstVT = MVT::v16f32;
39388 return true;
39389 }
39390 if (isTargetShuffleEquivalent(
39391 MaskVT, Mask,
39392 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39393 Shuffle = X86ISD::MOVSHDUP;
39394 SrcVT = DstVT = MVT::v16f32;
39395 return true;
39396 }
39397 }
39398
39399 return false;
39400}
39401
39402// Attempt to match a combined shuffle mask against supported unary immediate
39403// permute instructions.
39404// TODO: Investigate sharing more of this with shuffle lowering.
39405 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39406 const APInt &Zeroable,
39407 bool AllowFloatDomain, bool AllowIntDomain,
39408 const SelectionDAG &DAG,
39409 const X86Subtarget &Subtarget,
39410 unsigned &Shuffle, MVT &ShuffleVT,
39411 unsigned &PermuteImm) {
39412 unsigned NumMaskElts = Mask.size();
39413 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39414 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39415 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39416 bool ContainsZeros = isAnyZero(Mask);
39417
39418 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39419 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39420 // Check for lane crossing permutes.
39421 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39422 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39423 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39424 Shuffle = X86ISD::VPERMI;
39425 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39426 PermuteImm = getV4X86ShuffleImm(Mask);
39427 return true;
39428 }
39429 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39430 SmallVector<int, 4> RepeatedMask;
39431 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39432 Shuffle = X86ISD::VPERMI;
39433 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39434 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39435 return true;
39436 }
39437 }
39438 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39439 // VPERMILPD can permute with a non-repeating shuffle.
39440 Shuffle = X86ISD::VPERMILPI;
39441 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39442 PermuteImm = 0;
39443 for (int i = 0, e = Mask.size(); i != e; ++i) {
39444 int M = Mask[i];
39445 if (M == SM_SentinelUndef)
39446 continue;
39447 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39448 PermuteImm |= (M & 1) << i;
39449 }
39450 return true;
39451 }
39452 }
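// For illustration: the v4f64 mask {1, 0, 3, 2} stays within each 128-bit lane,
// so it maps to VPERMILPD with PermuteImm 0b0101 (bit i = Mask[i] & 1).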
39453
39454 // We are checking for a shuffle match or a shift match. Loop twice so we can
39455 // order which we try to match first depending on target preference.
39456 for (unsigned Order = 0; Order < 2; ++Order) {
39457 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39458 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39459 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39460 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39461 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39462 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39463 SmallVector<int, 4> RepeatedMask;
39464 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39465 // Narrow the repeated mask to create 32-bit element permutes.
39466 SmallVector<int, 4> WordMask = RepeatedMask;
39467 if (MaskScalarSizeInBits == 64)
39468 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39469
39470 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39471 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39472 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39473 PermuteImm = getV4X86ShuffleImm(WordMask);
39474 return true;
39475 }
39476 }
39477
39478 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39479 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39480 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39481 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39482 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39483 SmallVector<int, 4> RepeatedMask;
39484 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39485 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39486 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39487
39488 // PSHUFLW: permute lower 4 elements only.
39489 if (isUndefOrInRange(LoMask, 0, 4) &&
39490 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39491 Shuffle = X86ISD::PSHUFLW;
39492 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39493 PermuteImm = getV4X86ShuffleImm(LoMask);
39494 return true;
39495 }
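// For illustration: the repeated v8i16 mask {2,1,0,3, 4,5,6,7} leaves the high
// half in place, so it matches PSHUFLW with an immediate built from {2,1,0,3}.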
39496
39497 // PSHUFHW: permute upper 4 elements only.
39498 if (isUndefOrInRange(HiMask, 4, 8) &&
39499 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39500 // Offset the HiMask so that we can create the shuffle immediate.
39501 int OffsetHiMask[4];
39502 for (int i = 0; i != 4; ++i)
39503 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39504
39505 Shuffle = X86ISD::PSHUFHW;
39506 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39507 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39508 return true;
39509 }
39510 }
39511 }
39512 } else {
39513 // Attempt to match against bit rotates.
39514 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39515 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39516 Subtarget.hasAVX512())) {
39517 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39518 Subtarget, Mask);
39519 if (0 < RotateAmt) {
39520 Shuffle = X86ISD::VROTLI;
39521 PermuteImm = (unsigned)RotateAmt;
39522 return true;
39523 }
39524 }
39525 }
39526 // Attempt to match against byte/bit shifts.
39527 if (AllowIntDomain &&
39528 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39529 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39530 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39531 int ShiftAmt =
39532 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39533 Zeroable, Subtarget);
39534 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39535 32 <= ShuffleVT.getScalarSizeInBits())) {
39536 // Byte shifts can be slower so only match them on second attempt.
39537 if (Order == 0 &&
39538 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39539 continue;
39540
39541 PermuteImm = (unsigned)ShiftAmt;
39542 return true;
39543 }
39544
39545 }
39546 }
39547
39548 return false;
39549}
39550
39551// Attempt to match a combined unary shuffle mask against supported binary
39552// shuffle instructions.
39553// TODO: Investigate sharing more of this with shuffle lowering.
39554static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39555 bool AllowFloatDomain, bool AllowIntDomain,
39556 SDValue &V1, SDValue &V2, const SDLoc &DL,
39557 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39558 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39559 bool IsUnary) {
39560 unsigned NumMaskElts = Mask.size();
39561 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39562 unsigned SizeInBits = MaskVT.getSizeInBits();
39563
39564 if (MaskVT.is128BitVector()) {
39565 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39566 AllowFloatDomain) {
39567 V2 = V1;
39568 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39569 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39570 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39571 return true;
39572 }
39573 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39574 AllowFloatDomain) {
39575 V2 = V1;
39576 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39577 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39578 return true;
39579 }
39580 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39581 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39582 std::swap(V1, V2);
39583 Shuffle = X86ISD::MOVSD;
39584 SrcVT = DstVT = MVT::v2f64;
39585 return true;
39586 }
39587 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39588 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39589 Shuffle = X86ISD::MOVSS;
39590 SrcVT = DstVT = MVT::v4f32;
39591 return true;
39592 }
39593 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39594 DAG) &&
39595 Subtarget.hasFP16()) {
39596 Shuffle = X86ISD::MOVSH;
39597 SrcVT = DstVT = MVT::v8f16;
39598 return true;
39599 }
39600 }
39601
39602 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39603 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39604 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39605 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39606 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39607 Subtarget)) {
39608 DstVT = MaskVT;
39609 return true;
39610 }
39611 }
39612 // TODO: Can we handle this inside matchShuffleWithPACK?
39613 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39614 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39615 V1.getScalarValueSizeInBits() == 64 &&
39616 V2.getScalarValueSizeInBits() == 64) {
39617 // Use (SSE41) PACKUSDW if the leading zero bits extend into the lowest 16 bits.
39618 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39619 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39620 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39621 SrcVT = MVT::v4i32;
39622 DstVT = MVT::v8i16;
39623 Shuffle = X86ISD::PACKUS;
39624 return true;
39625 }
39626 // Use PACKUSWB if the leading zero bits extend into the lowest 8 bits.
39627 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39628 SrcVT = MVT::v8i16;
39629 DstVT = MVT::v16i8;
39630 Shuffle = X86ISD::PACKUS;
39631 return true;
39632 }
39633 // Use PACKSSDW if the sign bits extend into the lowest 16 bits.
39634 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39635 SrcVT = MVT::v4i32;
39636 DstVT = MVT::v8i16;
39637 Shuffle = X86ISD::PACKSS;
39638 return true;
39639 }
39640 }
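// For illustration: if every 64-bit element of V1/V2 is known to fit in 16 bits
// (>= 48 leading zeros), the truncating shuffle above becomes a single SSE41
// PACKUSDW; values that fit in 8 bits allow PACKUSWB instead.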
39641
39642 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39643 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39644 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39645 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39646 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39647 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39648 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39649 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39650 Subtarget)) {
39651 SrcVT = DstVT = MaskVT;
39652 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39653 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39654 return true;
39655 }
39656 }
39657
39658 // Attempt to match against an OR if we're performing a blend shuffle and the
39659 // non-blended source element is zero in each case.
39660 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39661 if (SizeInBits == V1.getValueSizeInBits() &&
39662 SizeInBits == V2.getValueSizeInBits() &&
39663 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39664 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39665 bool IsBlend = true;
39666 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39667 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39668 unsigned Scale1 = NumV1Elts / NumMaskElts;
39669 unsigned Scale2 = NumV2Elts / NumMaskElts;
39670 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39671 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39672 for (unsigned i = 0; i != NumMaskElts; ++i) {
39673 int M = Mask[i];
39674 if (M == SM_SentinelUndef)
39675 continue;
39676 if (M == SM_SentinelZero) {
39677 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39678 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39679 continue;
39680 }
39681 if (M == (int)i) {
39682 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39683 continue;
39684 }
39685 if (M == (int)(i + NumMaskElts)) {
39686 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39687 continue;
39688 }
39689 IsBlend = false;
39690 break;
39691 }
39692 if (IsBlend) {
39693 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39694 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39695 Shuffle = ISD::OR;
39696 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39697 return true;
39698 }
39699 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39700 // FIXME: handle mismatched sizes?
39701 // TODO: investigate if `ISD::OR` handling in
39702 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39703 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39704 unsigned NumElts = V.getValueType().getVectorNumElements();
39705 KnownBits Known(NumElts);
39706 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39707 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39708 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39709 if (PeepholeKnown.isZero())
39710 Known.Zero.setBit(EltIdx);
39711 if (PeepholeKnown.isAllOnes())
39712 Known.One.setBit(EltIdx);
39713 }
39714 return Known;
39715 };
39716
39717 KnownBits V1Known = computeKnownBitsElementWise(V1);
39718 KnownBits V2Known = computeKnownBitsElementWise(V2);
39719
39720 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39721 int M = Mask[i];
39722 if (M == SM_SentinelUndef)
39723 continue;
39724 if (M == SM_SentinelZero) {
39725 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39726 continue;
39727 }
39728 if (M == (int)i) {
39729 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39730 continue;
39731 }
39732 if (M == (int)(i + NumMaskElts)) {
39733 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39734 continue;
39735 }
39736 llvm_unreachable("will not get here.");
39737 }
39738 if (IsBlend) {
39739 Shuffle = ISD::OR;
39740 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39741 return true;
39742 }
39743 }
39744 }
39745 }
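// For illustration: the v4i32 blend mask {0, 5, 2, 7} takes lanes 0/2 from V1
// and lanes 1/3 from V2; if V2 is known zero in lanes 0/2 and V1 in lanes 1/3,
// the blend is simply (V1 | V2).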
39746
39747 return false;
39748}
39749
39750 static bool matchBinaryPermuteShuffle(
39751 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39752 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39753 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39754 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39755 unsigned NumMaskElts = Mask.size();
39756 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39757
39758 // Attempt to match against VALIGND/VALIGNQ rotate.
39759 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39760 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39761 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39762 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39763 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39764 MaskVT.getSizeInBits() / EltSizeInBits);
39765 if (!isAnyZero(Mask)) {
39766 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39767 if (0 < Rotation) {
39768 Shuffle = X86ISD::VALIGN;
39769 ShuffleVT = AlignVT;
39770 PermuteImm = Rotation;
39771 return true;
39772 }
39773 }
39774 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39775 unsigned ZeroLo = Zeroable.countr_one();
39776 unsigned ZeroHi = Zeroable.countl_one();
39777 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39778 if (ZeroLo) {
39779 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39780 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39781 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39782 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39783 Shuffle = X86ISD::VALIGN;
39784 ShuffleVT = AlignVT;
39785 PermuteImm = NumMaskElts - ZeroLo;
39786 return true;
39787 }
39788 }
39789 if (ZeroHi) {
39790 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39791 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39792 ZeroHi);
39793 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39794 V2 = V1;
39795 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39796 Shuffle = X86ISD::VALIGN;
39797 ShuffleVT = AlignVT;
39798 PermuteImm = ZeroHi;
39799 return true;
39800 }
39801 }
39802 }
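// For illustration: the v8i32 mask {2,3,4,5,6,7,8,9} reads a contiguous run
// across the two sources, which VALIGND can produce with an element rotation
// of 2.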
39803
39804 // Attempt to match against PALIGNR byte rotate.
39805 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39806 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39807 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39808 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39809 if (0 < ByteRotation) {
39810 Shuffle = X86ISD::PALIGNR;
39811 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39812 PermuteImm = ByteRotation;
39813 return true;
39814 }
39815 }
39816
39817 // Attempt to combine to X86ISD::BLENDI.
39818 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39819 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39820 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39821 uint64_t BlendMask = 0;
39822 bool ForceV1Zero = false, ForceV2Zero = false;
39823 SmallVector<int, 8> TargetMask(Mask);
39824 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39825 ForceV2Zero, BlendMask)) {
39826 if (MaskVT == MVT::v16i16) {
39827 // We can only use v16i16 PBLENDW if the lanes are repeated.
39828 SmallVector<int, 8> RepeatedMask;
39829 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39830 RepeatedMask)) {
39831 assert(RepeatedMask.size() == 8 &&
39832 "Repeated mask size doesn't match!");
39833 PermuteImm = 0;
39834 for (int i = 0; i < 8; ++i)
39835 if (RepeatedMask[i] >= 8)
39836 PermuteImm |= 1 << i;
39837 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39838 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39839 Shuffle = X86ISD::BLENDI;
39840 ShuffleVT = MaskVT;
39841 return true;
39842 }
39843 } else {
39844 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39845 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39846 PermuteImm = (unsigned)BlendMask;
39847 Shuffle = X86ISD::BLENDI;
39848 ShuffleVT = MaskVT;
39849 return true;
39850 }
39851 }
39852 }
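// For illustration: the v8i16 mask {0,9,2,11,4,13,6,15} takes the odd lanes
// from V2, giving a PBLENDW/BLENDI immediate of 0xAA.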
39853
39854 // Attempt to combine to INSERTPS, but only if it has elements that need to
39855 // be set to zero.
39856 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39857 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39858 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39859 Shuffle = X86ISD::INSERTPS;
39860 ShuffleVT = MVT::v4f32;
39861 return true;
39862 }
39863
39864 // Attempt to combine to SHUFPD.
39865 if (AllowFloatDomain && EltSizeInBits == 64 &&
39866 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39867 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39868 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39869 bool ForceV1Zero = false, ForceV2Zero = false;
39870 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39871 PermuteImm, Mask, Zeroable)) {
39872 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39873 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39874 Shuffle = X86ISD::SHUFP;
39875 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39876 return true;
39877 }
39878 }
39879
39880 // Attempt to combine to SHUFPS.
39881 if (AllowFloatDomain && EltSizeInBits == 32 &&
39882 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39883 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39884 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39885 SmallVector<int, 4> RepeatedMask;
39886 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39887 // Match each half of the repeated mask, to determine if it's just
39888 // referencing one of the vectors, is zeroable, or is entirely undef.
39889 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39890 int M0 = RepeatedMask[Offset];
39891 int M1 = RepeatedMask[Offset + 1];
39892
39893 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39894 return DAG.getUNDEF(MaskVT);
39895 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39896 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39897 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39898 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39899 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39900 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39901 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39902 return V1;
39903 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39904 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39905 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39906 return V2;
39907 }
39908
39909 return SDValue();
39910 };
39911
39912 int ShufMask[4] = {-1, -1, -1, -1};
39913 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39914 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39915
39916 if (Lo && Hi) {
39917 V1 = Lo;
39918 V2 = Hi;
39919 Shuffle = X86ISD::SHUFP;
39920 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39921 PermuteImm = getV4X86ShuffleImm(ShufMask);
39922 return true;
39923 }
39924 }
39925 }
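// For illustration: the repeated mask {1,3,4,6} takes its low pair from V1
// (elements 1,3) and its high pair from V2 (elements 0,2), so it becomes
// SHUFP(V1, V2) with the immediate built from {1,3,0,2}.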
39926
39927 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39928 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39929 MaskVT.is128BitVector() &&
39930 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39931 Shuffle = X86ISD::INSERTPS;
39932 ShuffleVT = MVT::v4f32;
39933 return true;
39934 }
39935
39936 return false;
39937}
39938
39939 static SDValue combineX86ShuffleChainWithExtract(
39940 ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39941 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39942 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39943 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39944 const X86Subtarget &Subtarget);
39945
39946/// Combine an arbitrary chain of shuffles into a single instruction if
39947/// possible.
39948///
39949/// This is the leaf of the recursive combine below. When we have found some
39950/// chain of single-use x86 shuffle instructions and accumulated the combined
39951/// shuffle mask represented by them, this will try to pattern match that mask
39952/// into either a single instruction if there is a special purpose instruction
39953/// for this operation, or into a PSHUFB instruction which is a fully general
39954/// instruction but should only be used to replace chains over a certain depth.
39955 static SDValue combineX86ShuffleChain(
39956 ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39957 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39958 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39959 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39960 const X86Subtarget &Subtarget) {
39961 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39962 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39963 "Unexpected number of shuffle inputs!");
39964 unsigned RootSizeInBits = RootVT.getSizeInBits();
39965 unsigned NumRootElts = RootVT.getVectorNumElements();
39966
39967 // Canonicalize shuffle input op to the requested type.
39968 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39969 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39970 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39971 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39972 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39973 return DAG.getBitcast(VT, Op);
39974 };
39975
39976 // Find the inputs that enter the chain. Note that multiple uses are OK
39977 // here, we're not going to remove the operands we find.
39978 bool UnaryShuffle = (Inputs.size() == 1);
39979 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39980 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39981 : peekThroughBitcasts(Inputs[1]));
39982
39983 MVT VT1 = V1.getSimpleValueType();
39984 MVT VT2 = V2.getSimpleValueType();
39985 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39986 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39987
39988 SDValue Res;
39989
39990 unsigned NumBaseMaskElts = BaseMask.size();
39991 if (NumBaseMaskElts == 1) {
39992 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39993 return CanonicalizeShuffleInput(RootVT, V1);
39994 }
39995
39996 bool OptForSize = DAG.shouldOptForSize();
39997 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39998 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39999 (RootVT.isFloatingPoint() && Depth >= 1) ||
40000 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40001
40002 // If we are shuffling a splat (and not introducing zeros) then we can just
40003 // use it directly. This works for smaller elements as well as they already
40004 // repeat across each mask element.
40005 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40006 V1.getValueSizeInBits() >= RootSizeInBits &&
40007 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40008 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40009 return CanonicalizeShuffleInput(RootVT, V1);
40010 }
40011
40012 SmallVector<int, 64> Mask(BaseMask);
40013
40014 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40015 // etc. can be simplified.
40016 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40017 SmallVector<int> ScaledMask, IdentityMask;
40018 unsigned NumElts = VT1.getVectorNumElements();
40019 if (Mask.size() <= NumElts &&
40020 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40021 for (unsigned i = 0; i != NumElts; ++i)
40022 IdentityMask.push_back(i);
40023 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40024 V2))
40025 return CanonicalizeShuffleInput(RootVT, V1);
40026 }
40027 }
40028
40029 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40030 if (RootVT.is512BitVector() &&
40031 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40032 // If the upper subvectors are zeroable, then an extract+insert is more
40033 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40034 // to zero the upper subvectors.
40035 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40036 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40037 return SDValue(); // Nothing to do!
40038 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40039 "Unexpected lane shuffle");
40040 Res = CanonicalizeShuffleInput(RootVT, V1);
40041 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40042 bool UseZero = isAnyZero(Mask);
40043 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40044 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40045 }
40046
40047 // Narrow shuffle mask to v4x128.
40048 SmallVector<int, 4> ScaledMask;
40049 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40050 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40051
40052 // Try to lower to vshuf64x2/vshuf32x4.
40053 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40054 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40055 SelectionDAG &DAG) {
40056 int PermMask[4] = {-1, -1, -1, -1};
40057 // Ensure elements came from the same Op.
40058 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40059 for (int i = 0; i < 4; ++i) {
40060 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40061 if (ScaledMask[i] < 0)
40062 continue;
40063
40064 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40065 unsigned OpIndex = i / 2;
40066 if (Ops[OpIndex].isUndef())
40067 Ops[OpIndex] = Op;
40068 else if (Ops[OpIndex] != Op)
40069 return SDValue();
40070
40071 PermMask[i] = ScaledMask[i] % 4;
40072 }
40073
40074 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40075 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40076 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40077 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40078 };
40079
40080 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40081 // doesn't work because our mask is for 128 bits and we don't have an MVT
40082 // to match that.
40083 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40084 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40085 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40086 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40087 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40088 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40089 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40090 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40091 ScaledMask[1] == (ScaledMask[3] % 2));
40092
40093 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40094 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40095 return SDValue(); // Nothing to do!
40096 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40097 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40098 return DAG.getBitcast(RootVT, V);
40099 }
40100 }
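// For illustration: the 4 x 128-bit mask {0,1,4,5} keeps the low 256 bits of
// V1 followed by the low 256 bits of V2, i.e. SHUF128 with PermMask {0,1,0,1}.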
40101
40102 // Handle 128-bit lane shuffles of 256-bit vectors.
40103 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40104 // If the upper half is zeroable, then an extract+insert is more optimal
40105 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40106 // zero the upper half.
40107 if (isUndefOrZero(Mask[1])) {
40108 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40109 return SDValue(); // Nothing to do!
40110 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40111 Res = CanonicalizeShuffleInput(RootVT, V1);
40112 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40113 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40114 256);
40115 }
40116
40117 // If we're inserting the low subvector, an insert-subvector 'concat'
40118 // pattern is quicker than VPERM2X128.
40119 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40120 !Subtarget.hasAVX2()) {
40121 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40122 return SDValue(); // Nothing to do!
40123 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40124 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40125 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40126 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40127 }
40128
40129 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40130 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40131 // feature.
40132 // Prefer blends for sequential shuffles unless we are optimizing for size.
40133 if (UnaryShuffle &&
40134 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40135 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40136 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40137 return SDValue(); // Nothing to do!
40138 unsigned PermMask = 0;
40139 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40140 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40141 return DAG.getNode(
40142 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40143 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40144 }
40145
40146 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40147 return SDValue(); // Nothing to do!
40148
40149 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40150 if (!UnaryShuffle && !IsMaskedShuffle) {
40151 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40152 "Unexpected shuffle sentinel value");
40153 // Prefer blends to X86ISD::VPERM2X128.
40154 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40155 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40156 return SDValue(); // Nothing to do!
40157 unsigned PermMask = 0;
40158 PermMask |= ((Mask[0] & 3) << 0);
40159 PermMask |= ((Mask[1] & 3) << 4);
40160 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40161 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40162 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40163 CanonicalizeShuffleInput(RootVT, LHS),
40164 CanonicalizeShuffleInput(RootVT, RHS),
40165 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40166 }
40167 }
40168 }
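// For illustration: the 2 x 128-bit mask {1, 2} selects the high half of V1
// and the low half of V2, giving VPERM2X128 with immediate 0x21.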
40169
40170 // For masks that have been widened to 128-bit elements or more,
40171 // narrow back down to 64-bit elements.
40172 if (BaseMaskEltSizeInBits > 64) {
40173 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40174 int MaskScale = BaseMaskEltSizeInBits / 64;
40175 SmallVector<int, 64> ScaledMask;
40176 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40177 Mask = std::move(ScaledMask);
40178 }
40179
40180 // For masked shuffles, we're trying to match the root width for better
40181 // writemask folding, attempt to scale the mask.
40182 // TODO - variable shuffles might need this to be widened again.
40183 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40184 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40185 int MaskScale = NumRootElts / Mask.size();
40186 SmallVector<int, 64> ScaledMask;
40187 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40188 Mask = std::move(ScaledMask);
40189 }
40190
40191 unsigned NumMaskElts = Mask.size();
40192 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40193 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40194
40195 // Determine the effective mask value type.
40196 FloatDomain &= (32 <= MaskEltSizeInBits);
40197 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40198 : MVT::getIntegerVT(MaskEltSizeInBits);
40199 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40200
40201 // Only allow legal mask types.
40202 if (!TLI.isTypeLegal(MaskVT))
40203 return SDValue();
40204
40205 // Attempt to match the mask against known shuffle patterns.
40206 MVT ShuffleSrcVT, ShuffleVT;
40207 unsigned Shuffle, PermuteImm;
40208
40209 // Which shuffle domains are permitted?
40210 // Permit domain crossing at higher combine depths.
40211 // TODO: Should we indicate which domain is preferred if both are allowed?
40212 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40213 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40214 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40215
40216 // Determine zeroable mask elements.
40217 APInt KnownUndef, KnownZero;
40218 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40219 APInt Zeroable = KnownUndef | KnownZero;
40220
40221 if (UnaryShuffle) {
40222 // Attempt to match against broadcast-from-vector.
40223 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40224 if ((Subtarget.hasAVX2() ||
40225 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40226 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40227 if (isUndefOrEqual(Mask, 0)) {
40228 if (V1.getValueType() == MaskVT &&
40229 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40230 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40231 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40232 return SDValue(); // Nothing to do!
40233 Res = V1.getOperand(0);
40234 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40235 return DAG.getBitcast(RootVT, Res);
40236 }
40237 if (Subtarget.hasAVX2()) {
40238 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40239 return SDValue(); // Nothing to do!
40240 Res = CanonicalizeShuffleInput(MaskVT, V1);
40241 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40242 return DAG.getBitcast(RootVT, Res);
40243 }
40244 }
40245 }
40246
40247 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40248 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40249 (!IsMaskedShuffle ||
40250 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40251 if (Depth == 0 && RootOpc == Shuffle)
40252 return SDValue(); // Nothing to do!
40253 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40254 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40255 return DAG.getBitcast(RootVT, Res);
40256 }
40257
40258 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40259 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40260 PermuteImm) &&
40261 (!IsMaskedShuffle ||
40262 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40263 if (Depth == 0 && RootOpc == Shuffle)
40264 return SDValue(); // Nothing to do!
40265 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40266 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40267 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40268 return DAG.getBitcast(RootVT, Res);
40269 }
40270 }
40271
40272 // Attempt to combine to INSERTPS, but only if the inserted element has come
40273 // from a scalar.
40274 // TODO: Handle other insertions here as well?
40275 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40276 Subtarget.hasSSE41() &&
40277 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40278 if (MaskEltSizeInBits == 32) {
40279 SDValue SrcV1 = V1, SrcV2 = V2;
40280 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40281 DAG) &&
40282 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40283 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40284 return SDValue(); // Nothing to do!
40285 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40286 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40287 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40288 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40289 return DAG.getBitcast(RootVT, Res);
40290 }
40291 }
40292 if (MaskEltSizeInBits == 64 &&
40293 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40294 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40295 V2.getScalarValueSizeInBits() <= 32) {
40296 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40297 return SDValue(); // Nothing to do!
40298 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40299 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40300 CanonicalizeShuffleInput(MVT::v4f32, V1),
40301 CanonicalizeShuffleInput(MVT::v4f32, V2),
40302 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40303 return DAG.getBitcast(RootVT, Res);
40304 }
40305 }
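// For illustration: in the {0, 2} case above, V2's low scalar lands in f32
// lane 2 of the result, encoded as immediate (2 << 4) = 0x20 with no zero mask.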
40306
40307 SDValue NewV1 = V1; // Save operands in case early exit happens.
40308 SDValue NewV2 = V2;
40309 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40310 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40311 ShuffleVT, UnaryShuffle) &&
40312 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40313 if (Depth == 0 && RootOpc == Shuffle)
40314 return SDValue(); // Nothing to do!
40315 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40316 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40317 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40318 return DAG.getBitcast(RootVT, Res);
40319 }
40320
40321 NewV1 = V1; // Save operands in case early exit happens.
40322 NewV2 = V2;
40323 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40324 AllowIntDomain, NewV1, NewV2, DL, DAG,
40325 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40326 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40327 if (Depth == 0 && RootOpc == Shuffle)
40328 return SDValue(); // Nothing to do!
40329 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40330 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40331 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40332 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40333 return DAG.getBitcast(RootVT, Res);
40334 }
40335
40336 // Typically from here on, we need an integer version of MaskVT.
40337 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40338 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40339
40340 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40341 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40342 uint64_t BitLen, BitIdx;
40343 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40344 Zeroable)) {
40345 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40346 return SDValue(); // Nothing to do!
40347 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40348 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40349 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40350 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40351 return DAG.getBitcast(RootVT, Res);
40352 }
40353
40354 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40355 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40356 return SDValue(); // Nothing to do!
40357 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40358 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40359 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40360 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40361 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40362 return DAG.getBitcast(RootVT, Res);
40363 }
40364 }
40365
40366 // Match shuffle against TRUNCATE patterns.
40367 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40368 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40369 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40370 Subtarget)) {
40371 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40372 ShuffleSrcVT.getVectorNumElements();
40373 unsigned Opc =
40374 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40375 if (Depth == 0 && RootOpc == Opc)
40376 return SDValue(); // Nothing to do!
40377 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40378 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40379 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40380 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40381 return DAG.getBitcast(RootVT, Res);
40382 }
40383
40384 // Do we need a more general binary truncation pattern?
40385 if (RootSizeInBits < 512 &&
40386 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40387 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40388 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40389 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40390 // Bail if this was already a truncation or PACK node.
40391 // We sometimes fail to match PACK if we demand known undef elements.
40392 if (Depth == 0 &&
40393 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40394 RootOpc == X86ISD::PACKUS))
40395 return SDValue(); // Nothing to do!
40396 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40397 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40398 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40399 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40400 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40401 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40402 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40403 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40404 return DAG.getBitcast(RootVT, Res);
40405 }
40406 }
40407
40408 // Don't try to re-form single instruction chains under any circumstances now
40409 // that we've done encoding canonicalization for them.
40410 if (Depth < 1)
40411 return SDValue();
40412
40413 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40414 return isTargetShuffleVariableMask(N->getOpcode());
40415 });
40416 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40417 return (N->getOpcode() == X86ISD::VPERMV3 ||
40418 N->getOpcode() == X86ISD::VPERMV);
40419 });
40420
40421 // Depth threshold above which we can efficiently use variable mask shuffles.
40422 int VariableCrossLaneShuffleDepth =
40423 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40424 int VariablePerLaneShuffleDepth =
40425 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40426 AllowVariableCrossLaneMask &=
40427 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40428 AllowVariablePerLaneMask &=
40429 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40430  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40431 // higher depth before combining them.
40432 int BWIVPERMV3ShuffleDepth =
40433 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40434 bool AllowBWIVPERMV3 =
40435 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
40436
40437 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40438 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40439 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40440
40441 bool MaskContainsZeros = isAnyZero(Mask);
40442
40443 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40444 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40445 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40446 if (Subtarget.hasAVX2() &&
40447 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40448 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40449 Res = CanonicalizeShuffleInput(MaskVT, V1);
40450 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40451 return DAG.getBitcast(RootVT, Res);
40452 }
40453 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40454 if ((Subtarget.hasAVX512() &&
40455 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40456 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40457 (Subtarget.hasBWI() &&
40458 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40459 (Subtarget.hasVBMI() &&
40460 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40461 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40462 V2 = DAG.getUNDEF(MaskVT);
40463 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40464 return DAG.getBitcast(RootVT, Res);
40465 }
40466 }
40467
40468 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40469 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40470 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40471 ((Subtarget.hasAVX512() &&
40472 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40473 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40474 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40475 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40476 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40477 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40478 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40479 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40480 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40481 for (unsigned i = 0; i != NumMaskElts; ++i)
40482 if (Mask[i] == SM_SentinelZero)
40483 Mask[i] = NumMaskElts + i;
40484 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40485 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40486 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40487 return DAG.getBitcast(RootVT, Res);
40488 }
40489
40490 // If that failed and either input is extracted then try to combine as a
40491 // shuffle with the larger type.
40492    if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40493            Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40494 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40495 IsMaskedShuffle, DAG, DL, Subtarget))
40496 return WideShuffle;
40497
40498 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40499 // (non-VLX will pad to 512-bit shuffles).
40500 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40501 ((Subtarget.hasAVX512() &&
40502 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40503 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40504 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40505 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40506 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40507 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40508 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40509 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40510 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40511 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40512 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40513 return DAG.getBitcast(RootVT, Res);
40514 }
40515 return SDValue();
40516 }
40517
40518 // See if we can combine a single input shuffle with zeros to a bit-mask,
40519 // which is much simpler than any shuffle.
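  // For example, an identity-or-zero v4i32 mask <0, Z, 2, U> becomes a vector
  // AND with the constant <-1, 0, -1, undef>.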
40520 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40521 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40522 TLI.isTypeLegal(MaskVT)) {
40523 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40524 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40525 APInt UndefElts(NumMaskElts, 0);
40526 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40527 for (unsigned i = 0; i != NumMaskElts; ++i) {
40528 int M = Mask[i];
40529 if (M == SM_SentinelUndef) {
40530 UndefElts.setBit(i);
40531 continue;
40532 }
40533 if (M == SM_SentinelZero)
40534 continue;
40535 EltBits[i] = AllOnes;
40536 }
40537 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40538 Res = CanonicalizeShuffleInput(MaskVT, V1);
40539 unsigned AndOpcode =
40540        MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40541    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40542 return DAG.getBitcast(RootVT, Res);
40543 }
40544
40545 // If we have a single input shuffle with different shuffle patterns in the
40546  // 128-bit lanes, use the variable mask to VPERMILPS.
40547 // TODO Combine other mask types at higher depths.
40548 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40549 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40550 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40551 SmallVector<SDValue, 16> VPermIdx;
40552 for (int M : Mask) {
40553 SDValue Idx =
40554 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40555 VPermIdx.push_back(Idx);
40556 }
40557 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40558 Res = CanonicalizeShuffleInput(MaskVT, V1);
40559 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40560 return DAG.getBitcast(RootVT, Res);
40561 }
40562
40563 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40564 // to VPERMIL2PD/VPERMIL2PS.
40565 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40566 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40567 MaskVT == MVT::v8f32)) {
40568 // VPERMIL2 Operation.
40569 // Bits[3] - Match Bit.
40570 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40571 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
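    // For example, a binary v4f32 shuffle mask <0, 5, 2, 7> becomes the
    // selector vector <0, 5, 2, 7>: selector values 0-3 pick an element from
    // V1's lane, values 4-7 pick from V2's lane, and zeroable elements use
    // index 8 (match bit set) together with the M2Z immediate.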
40572 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40573 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40574 SmallVector<int, 8> VPerm2Idx;
40575 unsigned M2ZImm = 0;
40576 for (int M : Mask) {
40577 if (M == SM_SentinelUndef) {
40578 VPerm2Idx.push_back(-1);
40579 continue;
40580 }
40581 if (M == SM_SentinelZero) {
40582 M2ZImm = 2;
40583 VPerm2Idx.push_back(8);
40584 continue;
40585 }
40586 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40587 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40588 VPerm2Idx.push_back(Index);
40589 }
40590 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40591 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40592 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40593 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40594 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40595 return DAG.getBitcast(RootVT, Res);
40596 }
40597
40598 // If we have 3 or more shuffle instructions or a chain involving a variable
40599 // mask, we can replace them with a single PSHUFB instruction profitably.
40600  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40601 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40602 // more aggressive.
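  // For example, a v4i32 shuffle mask <1, 0, 3, 2> expands to the byte mask
  // <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>, and zeroable elements map to
  // 0x80, which makes PSHUFB write a zero byte.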
40603 if (UnaryShuffle && AllowVariablePerLaneMask &&
40604 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40605 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40606 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40607 SmallVector<SDValue, 16> PSHUFBMask;
40608 int NumBytes = RootVT.getSizeInBits() / 8;
40609 int Ratio = NumBytes / NumMaskElts;
40610 for (int i = 0; i < NumBytes; ++i) {
40611 int M = Mask[i / Ratio];
40612 if (M == SM_SentinelUndef) {
40613 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40614 continue;
40615 }
40616 if (M == SM_SentinelZero) {
40617 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40618 continue;
40619 }
40620 M = Ratio * M + i % Ratio;
40621 assert((M / 16) == (i / 16) && "Lane crossing detected");
40622 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40623 }
40624 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40625 Res = CanonicalizeShuffleInput(ByteVT, V1);
40626 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40627 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40628 return DAG.getBitcast(RootVT, Res);
40629 }
40630
40631 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40632 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40633 // slower than PSHUFB on targets that support both.
40634 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40635 Subtarget.hasXOP()) {
40636 // VPPERM Mask Operation
40637 // Bits[4:0] - Byte Index (0 - 31)
40638 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
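    // Byte indices 0-15 select from the first source and 16-31 from the
    // second; 0x80 selects the ZERO operation. For example, a v8i16 mask
    // element of 9 (word 1 of the second source) expands to the byte
    // selectors 18,19.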
40639 SmallVector<SDValue, 16> VPPERMMask;
40640 int NumBytes = 16;
40641 int Ratio = NumBytes / NumMaskElts;
40642 for (int i = 0; i < NumBytes; ++i) {
40643 int M = Mask[i / Ratio];
40644 if (M == SM_SentinelUndef) {
40645 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40646 continue;
40647 }
40648 if (M == SM_SentinelZero) {
40649 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40650 continue;
40651 }
40652 M = Ratio * M + i % Ratio;
40653 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40654 }
40655 MVT ByteVT = MVT::v16i8;
40656 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40657 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40658 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40659 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40660 return DAG.getBitcast(RootVT, Res);
40661 }
40662
40663 // If that failed and either input is extracted then try to combine as a
40664 // shuffle with the larger type.
40665  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40666          Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40667 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40668 DAG, DL, Subtarget))
40669 return WideShuffle;
40670
40671 // If we have a dual input shuffle then lower to VPERMV3,
40672 // (non-VLX will pad to 512-bit shuffles)
40673 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40674 ((Subtarget.hasAVX512() &&
40675 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40676 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40677 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40678 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40679 MaskVT == MVT::v16i32)) ||
40680 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40681 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40682 MaskVT == MVT::v32i16)) ||
40683 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40684 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40685 MaskVT == MVT::v64i8)))) {
40686 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40687 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40688 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40689 return DAG.getBitcast(RootVT, Res);
40690 }
40691
40692 // Failed to find any combines.
40693 return SDValue();
40694}
40695
40696// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40697// instruction if possible.
40698//
40699// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40700// type size to attempt to combine:
40701// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40702// -->
40703// extract_subvector(shuffle(x,y,m2),0)
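// For example, a 128-bit shuffle of two v4i32 subvectors extracted from the
// upper halves of wider v8i32 sources can be rewritten as a v8i32 shuffle of
// the original sources (with the mask indices offset by the extract positions)
// followed by an extract_subvector of element 0.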
40704static SDValue combineX86ShuffleChainWithExtract(
40705    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40706 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40707 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40708 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40709 const X86Subtarget &Subtarget) {
40710 unsigned NumMaskElts = BaseMask.size();
40711 unsigned NumInputs = Inputs.size();
40712 if (NumInputs == 0)
40713 return SDValue();
40714
40715 unsigned RootSizeInBits = RootVT.getSizeInBits();
40716 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40717 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40718
40719 // Peek through subvectors to find widest legal vector.
40720 // TODO: Handle ISD::TRUNCATE
40721 unsigned WideSizeInBits = RootSizeInBits;
40722 for (SDValue Input : Inputs) {
40723    Input = peekThroughBitcasts(Input);
40724    while (1) {
40725 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40726 Input = peekThroughBitcasts(Input.getOperand(0));
40727 continue;
40728 }
40729 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40730 Input.getOperand(0).isUndef() &&
40731 isNullConstant(Input.getOperand(2))) {
40732 Input = peekThroughBitcasts(Input.getOperand(1));
40733 continue;
40734 }
40735 break;
40736 }
40737 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40738 WideSizeInBits < Input.getValueSizeInBits())
40739 WideSizeInBits = Input.getValueSizeInBits();
40740 }
40741
40742 // Bail if we fail to find a source larger than the existing root.
40743 if (WideSizeInBits <= RootSizeInBits ||
40744 (WideSizeInBits % RootSizeInBits) != 0)
40745 return SDValue();
40746
40747 // Create new mask for larger type.
40748 SmallVector<int, 64> WideMask;
40749 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40750
40751 // Attempt to peek through inputs and adjust mask when we extract from an
40752 // upper subvector.
40753 int AdjustedMasks = 0;
40754 SmallVector<SDValue, 4> WideInputs(Inputs);
40755 for (unsigned I = 0; I != NumInputs; ++I) {
40756 SDValue &Input = WideInputs[I];
40757    Input = peekThroughBitcasts(Input);
40758    while (1) {
40759 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40760 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40761 uint64_t Idx = Input.getConstantOperandVal(1);
40762 if (Idx != 0) {
40763 ++AdjustedMasks;
40764 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40765 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40766
40767 int lo = I * WideMask.size();
40768 int hi = (I + 1) * WideMask.size();
40769 for (int &M : WideMask)
40770 if (lo <= M && M < hi)
40771 M += Idx;
40772 }
40773 Input = peekThroughBitcasts(Input.getOperand(0));
40774 continue;
40775 }
40776 // TODO: Handle insertions into upper subvectors.
40777 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40778 Input.getOperand(0).isUndef() &&
40779 isNullConstant(Input.getOperand(2))) {
40780 Input = peekThroughBitcasts(Input.getOperand(1));
40781 continue;
40782 }
40783 break;
40784 }
40785 }
40786
40787 // Remove unused/repeated shuffle source ops.
40788 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40789 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40790
40791  // Bail if we're always extracting from the lowest subvectors
40792  // (combineX86ShuffleChain should match this for the current width), or if
40793  // the shuffle still references too many inputs.
40794 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40795 return SDValue();
40796
40797 // Minor canonicalization of the accumulated shuffle mask to make it easier
40798 // to match below. All this does is detect masks with sequential pairs of
40799 // elements, and shrink them to the half-width mask. It does this in a loop
40800 // so it will reduce the size of the mask to the minimal width mask which
40801 // performs an equivalent shuffle.
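  // For example, the mask <0,1, 6,7, 4,5, 2,3> has sequential pairs and
  // shrinks to <0, 3, 2, 1>, which does not widen any further.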
40802 while (WideMask.size() > 1) {
40803 SmallVector<int, 64> WidenedMask;
40804 if (!canWidenShuffleElements(WideMask, WidenedMask))
40805 break;
40806 WideMask = std::move(WidenedMask);
40807 }
40808
40809 // Canonicalization of binary shuffle masks to improve pattern matching by
40810 // commuting the inputs.
40811 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40812    ShuffleVectorSDNode::commuteMask(WideMask);
40813    std::swap(WideInputs[0], WideInputs[1]);
40814 }
40815
40816 // Increase depth for every upper subvector we've peeked through.
40817 Depth += AdjustedMasks;
40818
40819 // Attempt to combine wider chain.
40820 // TODO: Can we use a better Root?
40821 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40822 WideInputs.back().getValueSizeInBits()
40823 ? WideInputs.front()
40824 : WideInputs.back();
40825 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40826 "WideRootSize mismatch");
40827
40828 if (SDValue WideShuffle = combineX86ShuffleChain(
40829 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40830 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40831 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40832 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40833 return DAG.getBitcast(RootVT, WideShuffle);
40834 }
40835
40836 return SDValue();
40837}
40838
40839// Canonicalize the combined shuffle mask chain with horizontal ops.
40840// NOTE: This may update the Ops and Mask.
40841static SDValue canonicalizeShuffleMaskWithHorizOp(
40842    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40843    unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40844 const X86Subtarget &Subtarget) {
40845 if (Mask.empty() || Ops.empty())
40846 return SDValue();
40847
40848  SmallVector<SDValue> BC;
40849  for (SDValue Op : Ops)
40850    BC.push_back(Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op);
40851
40852 // All ops must be the same horizop + type.
40853 SDValue BC0 = BC[0];
40854 EVT VT0 = BC0.getValueType();
40855 unsigned Opcode0 = BC0.getOpcode();
40856 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40857 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40858 }))
40859 return SDValue();
40860
40861 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40862 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40863 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40864 if (!isHoriz && !isPack)
40865 return SDValue();
40866
40867 // Do all ops have a single use?
40868 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40869 return Op.hasOneUse() &&
40870           peekThroughBitcasts(Op).hasOneUse();
40871  });
40872
40873 int NumElts = VT0.getVectorNumElements();
40874 int NumLanes = VT0.getSizeInBits() / 128;
40875 int NumEltsPerLane = NumElts / NumLanes;
40876 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40877 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40878 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40879
40880 if (NumEltsPerLane >= 4 &&
40881 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40882 SmallVector<int> LaneMask, ScaledMask;
40883 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40884 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40885 // See if we can remove the shuffle by resorting the HOP chain so that
40886 // the HOP args are pre-shuffled.
40887 // TODO: Generalize to any sized/depth chain.
40888 // TODO: Add support for PACKSS/PACKUS.
40889 if (isHoriz) {
40890 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40891 auto GetHOpSrc = [&](int M) {
40892 if (M == SM_SentinelUndef)
40893 return DAG.getUNDEF(VT0);
40894 if (M == SM_SentinelZero)
40895 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40896 SDValue Src0 = BC[M / 4];
40897 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40898 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40899 return Src1.getOperand(M % 2);
40900 return SDValue();
40901 };
40902 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40903 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40904 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40905 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40906 if (M0 && M1 && M2 && M3) {
40907 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40908 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40909 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40910 }
40911 }
40912 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40913 if (Ops.size() >= 2) {
40914 SDValue LHS, RHS;
40915 auto GetHOpSrc = [&](int M, int &OutM) {
40916 // TODO: Support SM_SentinelZero
40917 if (M < 0)
40918 return M == SM_SentinelUndef;
40919 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40920 if (!LHS || LHS == Src) {
40921 LHS = Src;
40922 OutM = (M % 2);
40923 return true;
40924 }
40925 if (!RHS || RHS == Src) {
40926 RHS = Src;
40927 OutM = (M % 2) + 2;
40928 return true;
40929 }
40930 return false;
40931 };
40932 int PostMask[4] = {-1, -1, -1, -1};
40933 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40934 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40935 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40936 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40937 LHS = DAG.getBitcast(SrcVT, LHS);
40938 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40939 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40940 // Use SHUFPS for the permute so this will work on SSE2 targets,
40941 // shuffle combining and domain handling will simplify this later on.
40942 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40943 Res = DAG.getBitcast(ShuffleVT, Res);
40944 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40945 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40946 }
40947 }
40948 }
40949 }
40950
40951 if (2 < Ops.size())
40952 return SDValue();
40953
40954 SDValue BC1 = BC[BC.size() - 1];
40955 if (Mask.size() == VT0.getVectorNumElements()) {
40956 // Canonicalize binary shuffles of horizontal ops that use the
40957      // same sources to a unary shuffle.
40958 // TODO: Try to perform this fold even if the shuffle remains.
40959 if (Ops.size() == 2) {
40960 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40961 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40962 };
40963 // Commute if all BC0's ops are contained in BC1.
40964 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40965 ContainsOps(BC1, BC0.getOperand(1))) {
40966        ShuffleVectorSDNode::commuteMask(Mask);
40967        std::swap(Ops[0], Ops[1]);
40968 std::swap(BC0, BC1);
40969 }
40970
40971 // If BC1 can be represented by BC0, then convert to unary shuffle.
40972 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40973 ContainsOps(BC0, BC1.getOperand(1))) {
40974 for (int &M : Mask) {
40975 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40976 continue;
40977 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40978 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40979 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40980 M += NumHalfEltsPerLane;
40981 }
40982 }
40983 }
40984
40985 // Canonicalize unary horizontal ops to only refer to lower halves.
40986 for (int i = 0; i != NumElts; ++i) {
40987 int &M = Mask[i];
40988 if (isUndefOrZero(M))
40989 continue;
40990 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40991 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40992 M -= NumHalfEltsPerLane;
40993 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40994 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40995 M -= NumHalfEltsPerLane;
40996 }
40997 }
40998
40999 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41000 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41001 // represents the LHS/RHS inputs for the lower/upper halves.
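  // For example, a shuffle that takes the low half of HADD(a,b) and the low
  // half of HADD(c,d) (WideMask128 == <0, 2>) is just HADD(a,c).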
41002 SmallVector<int, 16> TargetMask128, WideMask128;
41003 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41004 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41005 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41006 bool SingleOp = (Ops.size() == 1);
41007 if (isPack || OneUseOps ||
41008 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41009 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41010 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41011 Lo = Lo.getOperand(WideMask128[0] & 1);
41012 Hi = Hi.getOperand(WideMask128[1] & 1);
41013 if (SingleOp) {
41014 SDValue Undef = DAG.getUNDEF(SrcVT);
41015 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41016 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41017 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41018 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41019 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41020 }
41021 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41022 }
41023 }
41024
41025 // If we are post-shuffling a 256-bit hop and not requiring the upper
41026 // elements, then try to narrow to a 128-bit hop directly.
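  // For example, if only the low two quadwords of a 256-bit HADD(a,b) are
  // demanded and WideMask64 == <0, 3, U, U>, the result is a 128-bit
  // HADD(lo128(a), hi128(b)) widened back to 256 bits with undef upper
  // elements.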
41027 SmallVector<int, 16> WideMask64;
41028 if (Ops.size() == 1 && NumLanes == 2 &&
41029 scaleShuffleElements(Mask, 4, WideMask64) &&
41030 isUndefInRange(WideMask64, 2, 2)) {
41031 int M0 = WideMask64[0];
41032 int M1 = WideMask64[1];
41033 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41034      MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
41035      unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41036 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41037 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41038 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41039 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41040 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41041 }
41042 }
41043
41044 return SDValue();
41045}
41046
41047// Attempt to constant fold all of the constant source ops.
41048// Returns true if the entire shuffle is folded to a constant.
41049// TODO: Extend this to merge multiple constant Ops and update the mask.
41050static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41051                                          ArrayRef<int> Mask,
41052 ArrayRef<const SDNode *> SrcNodes,
41053 SelectionDAG &DAG, const SDLoc &DL,
41054 const X86Subtarget &Subtarget) {
41055 unsigned SizeInBits = VT.getSizeInBits();
41056 unsigned NumMaskElts = Mask.size();
41057 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41058 unsigned NumOps = Ops.size();
41059
41060 // Extract constant bits from each source op.
41061 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41062  SmallVector<SmallVector<APInt>, 16> RawBitsOps(NumOps);
41063  for (unsigned I = 0; I != NumOps; ++I)
41064 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41065 RawBitsOps[I],
41066 /*AllowWholeUndefs*/ true,
41067 /*AllowPartialUndefs*/ true))
41068 return SDValue();
41069
41070 // If we're optimizing for size, only fold if at least one of the constants is
41071 // only used once or the combined shuffle has included a variable mask
41072  // shuffle; this is to avoid constant pool bloat.
41073 bool IsOptimizingSize = DAG.shouldOptForSize();
41074 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41075 return isTargetShuffleVariableMask(N->getOpcode());
41076 });
41077 if (IsOptimizingSize && !HasVariableMask &&
41078 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41079 return SDValue();
41080
41081 // Shuffle the constant bits according to the mask.
41082 APInt UndefElts(NumMaskElts, 0);
41083 APInt ZeroElts(NumMaskElts, 0);
41084 APInt ConstantElts(NumMaskElts, 0);
41085 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41086 APInt::getZero(MaskSizeInBits));
41087 for (unsigned i = 0; i != NumMaskElts; ++i) {
41088 int M = Mask[i];
41089 if (M == SM_SentinelUndef) {
41090 UndefElts.setBit(i);
41091 continue;
41092 } else if (M == SM_SentinelZero) {
41093 ZeroElts.setBit(i);
41094 continue;
41095 }
41096 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41097
41098 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41099 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41100
41101 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41102 if (SrcUndefElts[SrcMaskIdx]) {
41103 UndefElts.setBit(i);
41104 continue;
41105 }
41106
41107 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41108 APInt &Bits = SrcEltBits[SrcMaskIdx];
41109 if (!Bits) {
41110 ZeroElts.setBit(i);
41111 continue;
41112 }
41113
41114 ConstantElts.setBit(i);
41115 ConstantBitData[i] = Bits;
41116 }
41117 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41118
41119 // Attempt to create a zero vector.
41120 if ((UndefElts | ZeroElts).isAllOnes())
41121 return getZeroVector(VT, Subtarget, DAG, DL);
41122
41123 // Create the constant data.
41124 MVT MaskSVT;
41125 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41126 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41127 else
41128 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41129
41130 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41131 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41132 return SDValue();
41133
41134 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41135 return DAG.getBitcast(VT, CstOp);
41136}
41137
41138namespace llvm {
41139 namespace X86 {
41140 enum {
41141      MaxShuffleCombineDepth = 8
41142    };
41143 } // namespace X86
41144} // namespace llvm
41145
41146/// Fully generic combining of x86 shuffle instructions.
41147///
41148/// This should be the last combine run over the x86 shuffle instructions. Once
41149/// they have been fully optimized, this will recursively consider all chains
41150/// of single-use shuffle instructions, build a generic model of the cumulative
41151/// shuffle operation, and check for simpler instructions which implement this
41152/// operation. We use this primarily for two purposes:
41153///
41154/// 1) Collapse generic shuffles to specialized single instructions when
41155/// equivalent. In most cases, this is just an encoding size win, but
41156/// sometimes we will collapse multiple generic shuffles into a single
41157/// special-purpose shuffle.
41158/// 2) Look for sequences of shuffle instructions with 3 or more total
41159/// instructions, and replace them with the slightly more expensive SSSE3
41160/// PSHUFB instruction if available. We do this as the last combining step
41161/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41162/// a suitable short sequence of other instructions. The PSHUFB will either
41163/// use a register or have to read from memory and so is slightly (but only
41164/// slightly) more expensive than the other shuffle instructions.
41165///
41166/// Because this is inherently a quadratic operation (for each shuffle in
41167/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41168/// This should never be an issue in practice as the shuffle lowering doesn't
41169/// produce sequences of more than 8 instructions.
41170///
41171/// FIXME: We will currently miss some cases where the redundant shuffling
41172/// would simplify under the threshold for PSHUFB formation because of
41173/// combine-ordering. To fix this, we should do the redundant instruction
41174/// combining in this recursive walk.
41175static SDValue combineX86ShufflesRecursively(
41176    ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41177 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41178 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41179 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41180 const SDLoc &DL, const X86Subtarget &Subtarget) {
41181 assert(!RootMask.empty() &&
41182 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41183 "Illegal shuffle root mask");
41184 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41185 unsigned RootSizeInBits = RootVT.getSizeInBits();
41186 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41187
41188 // Bound the depth of our recursive combine because this is ultimately
41189 // quadratic in nature.
41190 if (Depth >= MaxDepth)
41191 return SDValue();
41192
41193 // Directly rip through bitcasts to find the underlying operand.
41194 SDValue Op = SrcOps[SrcOpIndex];
41195  Op = peekThroughBitcasts(Op);
41196
41197 EVT VT = Op.getValueType();
41198 if (!VT.isVector() || !VT.isSimple())
41199 return SDValue(); // Bail if we hit a non-simple non-vector.
41200
41201 // FIXME: Just bail on f16 for now.
41202 if (VT.getVectorElementType() == MVT::f16)
41203 return SDValue();
41204
41205 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41206         "Can only combine shuffles up to size of the root op.");
41207
41208 // Create a demanded elts mask from the referenced elements of Op.
41209 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41210 for (int M : RootMask) {
41211 int BaseIdx = RootMask.size() * SrcOpIndex;
41212 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41213 OpDemandedElts.setBit(M - BaseIdx);
41214 }
41215 if (RootSizeInBits != VT.getSizeInBits()) {
41216 // Op is smaller than Root - extract the demanded elts for the subvector.
41217 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41218 unsigned NumOpMaskElts = RootMask.size() / Scale;
41219 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41220 assert(OpDemandedElts
41221 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41222 .isZero() &&
41223 "Out of range elements referenced in root mask");
41224 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41225 }
41226 OpDemandedElts =
41227 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41228
41229 // Extract target shuffle mask and resolve sentinels and inputs.
41230 SmallVector<int, 64> OpMask;
41231 SmallVector<SDValue, 2> OpInputs;
41232 APInt OpUndef, OpZero;
41233 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41234 OpZero, DAG, Depth, false)) {
41235 // Shuffle inputs must not be larger than the shuffle result.
41236 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41237 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41238 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41239 }))
41240 return SDValue();
41241 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41242 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41243 !isNullConstant(Op.getOperand(1))) {
41244 SDValue SrcVec = Op.getOperand(0);
41245 int ExtractIdx = Op.getConstantOperandVal(1);
41246 unsigned NumElts = VT.getVectorNumElements();
41247 OpInputs.assign({SrcVec});
41248 OpMask.assign(NumElts, SM_SentinelUndef);
41249 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41250 OpZero = OpUndef = APInt::getZero(NumElts);
41251 } else {
41252 return SDValue();
41253 }
41254
41255 // If the shuffle result was smaller than the root, we need to adjust the
41256 // mask indices and pad the mask with undefs.
41257 if (RootSizeInBits > VT.getSizeInBits()) {
41258 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41259 unsigned OpMaskSize = OpMask.size();
41260 if (OpInputs.size() > 1) {
41261 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41262 for (int &M : OpMask) {
41263 if (M < 0)
41264 continue;
41265 int EltIdx = M % OpMaskSize;
41266 int OpIdx = M / OpMaskSize;
41267 M = (PaddedMaskSize * OpIdx) + EltIdx;
41268 }
41269 }
41270 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41271 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41272 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41273 }
41274
41275  SmallVector<int, 64> Mask;
41276  SmallVector<SDValue, 16> Ops;
41277
41278 // We don't need to merge masks if the root is empty.
41279 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41280 if (EmptyRoot) {
41281 // Only resolve zeros if it will remove an input, otherwise we might end
41282 // up in an infinite loop.
41283 bool ResolveKnownZeros = true;
41284 if (!OpZero.isZero()) {
41285 APInt UsedInputs = APInt::getZero(OpInputs.size());
41286 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41287 int M = OpMask[i];
41288 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41289 continue;
41290 UsedInputs.setBit(M / OpMask.size());
41291 if (UsedInputs.isAllOnes()) {
41292 ResolveKnownZeros = false;
41293 break;
41294 }
41295 }
41296 }
41297 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41298 ResolveKnownZeros);
41299
41300 Mask = OpMask;
41301 Ops.append(OpInputs.begin(), OpInputs.end());
41302 } else {
41303 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41304
41305 // Add the inputs to the Ops list, avoiding duplicates.
41306 Ops.append(SrcOps.begin(), SrcOps.end());
41307
41308 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41309 // Attempt to find an existing match.
41310      SDValue InputBC = peekThroughBitcasts(Input);
41311      for (int i = 0, e = Ops.size(); i < e; ++i)
41312 if (InputBC == peekThroughBitcasts(Ops[i]))
41313 return i;
41314 // Match failed - should we replace an existing Op?
41315 if (InsertionPoint >= 0) {
41316        Ops[InsertionPoint] = Input;
41317        return InsertionPoint;
41318 }
41319 // Add to the end of the Ops list.
41320 Ops.push_back(Input);
41321 return Ops.size() - 1;
41322 };
41323
41324 SmallVector<int, 2> OpInputIdx;
41325 for (SDValue OpInput : OpInputs)
41326 OpInputIdx.push_back(
41327 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41328
41329 assert(((RootMask.size() > OpMask.size() &&
41330 RootMask.size() % OpMask.size() == 0) ||
41331 (OpMask.size() > RootMask.size() &&
41332 OpMask.size() % RootMask.size() == 0) ||
41333 OpMask.size() == RootMask.size()) &&
41334 "The smaller number of elements must divide the larger.");
41335
41336 // This function can be performance-critical, so we rely on the power-of-2
41337 // knowledge that we have about the mask sizes to replace div/rem ops with
41338 // bit-masks and shifts.
41339    assert(llvm::has_single_bit(RootMask.size()) &&
41340           "Non-power-of-2 shuffle mask sizes");
41341    assert(llvm::has_single_bit(OpMask.size()) &&
41342           "Non-power-of-2 shuffle mask sizes");
41343 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41344 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41345
41346 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41347 unsigned RootRatio =
41348 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41349 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41350 assert((RootRatio == 1 || OpRatio == 1) &&
41351 "Must not have a ratio for both incoming and op masks!");
41352
41353 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41354 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41355 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41356 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41357 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41358
41359 Mask.resize(MaskWidth, SM_SentinelUndef);
41360
41361 // Merge this shuffle operation's mask into our accumulated mask. Note that
41362 // this shuffle's mask will be the first applied to the input, followed by
41363 // the root mask to get us all the way to the root value arrangement. The
41364 // reason for this order is that we are recursing up the operation chain.
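    // For example, with equal mask widths, SrcOpIndex == 0 and the op's single
    // input resolving to slot 0, a root mask <0, 2, 5, 7> merged with an op
    // mask <1, 0, 3, 2> yields <1, 3, 5, 7>: lanes taken from other source ops
    // pass through unchanged, lanes taken from this op are remapped through
    // the op's mask.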
41365 for (unsigned i = 0; i < MaskWidth; ++i) {
41366 unsigned RootIdx = i >> RootRatioLog2;
41367 if (RootMask[RootIdx] < 0) {
41368 // This is a zero or undef lane, we're done.
41369 Mask[i] = RootMask[RootIdx];
41370 continue;
41371 }
41372
41373 unsigned RootMaskedIdx =
41374 RootRatio == 1
41375 ? RootMask[RootIdx]
41376 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41377
41378 // Just insert the scaled root mask value if it references an input other
41379 // than the SrcOp we're currently inserting.
41380 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41381 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41382 Mask[i] = RootMaskedIdx;
41383 continue;
41384 }
41385
41386 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41387 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41388 if (OpMask[OpIdx] < 0) {
41389 // The incoming lanes are zero or undef, it doesn't matter which ones we
41390 // are using.
41391 Mask[i] = OpMask[OpIdx];
41392 continue;
41393 }
41394
41395 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41396 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41397 : (OpMask[OpIdx] << OpRatioLog2) +
41398 (RootMaskedIdx & (OpRatio - 1));
41399
41400 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41401 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41402 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41403 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41404
41405 Mask[i] = OpMaskedIdx;
41406 }
41407 }
41408
41409 // Peek through any free bitcasts to insert_subvector vector widenings or
41410 // extract_subvector nodes back to root size.
41411 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41412 for (auto [I, Op] : enumerate(Ops)) {
41413 SDValue BC = Op;
41414 while (1) {
41415 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41416 BC = BC.getOperand(0);
41417 continue;
41418 }
41419 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41420 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41421 // Set out of bounds mask indices to undef.
41422 Op = BC = BC.getOperand(1);
41423 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41424 int Lo = I * Mask.size();
41425 int Hi = (I + 1) * Mask.size();
41426 int NewHi = Lo + (Mask.size() / Scale);
41427 for (int &M : Mask) {
41428 if (Lo <= M && NewHi <= M && M < Hi)
41429 M = SM_SentinelUndef;
41430 }
41431 continue;
41432 }
41433 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41434 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41435 isNullConstant(BC.getOperand(1))) {
41436 Op = BC = BC.getOperand(0);
41437 continue;
41438 }
41439 break;
41440 }
41441 }
41442
41443 // Remove unused/repeated shuffle source ops.
41444  resolveTargetShuffleInputsAndMask(Ops, Mask);
41445
41446 // Handle the all undef/zero/ones cases early.
41447 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41448 return DAG.getUNDEF(RootVT);
41449 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41450 return getZeroVector(RootVT, Subtarget, DAG, DL);
41451 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41452      !llvm::is_contained(Mask, SM_SentinelZero))
41453    return getOnesVector(RootVT, DAG, DL);
41454
41455 assert(!Ops.empty() && "Shuffle with no inputs detected");
41456
41457 // Update the list of shuffle nodes that have been combined so far.
41458 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41459 CombinedNodes.push_back(Op.getNode());
41460
41461 // See if we can recurse into each shuffle source op (if it's a target
41462 // shuffle). The source op should only be generally combined if it either has
41463  // a single use (i.e. current Op) or all its users have already been combined;
41464  // if not, then we can still combine but should prevent generation of variable
41465 // shuffles to avoid constant pool bloat.
41466 // Don't recurse if we already have more source ops than we can combine in
41467 // the remaining recursion depth.
41468 if (Ops.size() < (MaxDepth - Depth)) {
41469 for (int i = 0, e = Ops.size(); i < e; ++i) {
41470 // For empty roots, we need to resolve zeroable elements before combining
41471 // them with other shuffles.
41472 SmallVector<int, 64> ResolvedMask = Mask;
41473 if (EmptyRoot)
41474 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41475 bool AllowCrossLaneVar = false;
41476 bool AllowPerLaneVar = false;
41477 if (Ops[i].getNode()->hasOneUse() ||
41478 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41479 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41480 AllowPerLaneVar = AllowVariablePerLaneMask;
41481 }
41482      if (SDValue Res = combineX86ShufflesRecursively(
41483              Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41484 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41485 DAG, DL, Subtarget))
41486 return Res;
41487 }
41488 }
41489
41490 // Attempt to constant fold all of the constant source ops.
41491  if (SDValue Cst = combineX86ShufflesConstants(
41492          RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41493 return Cst;
41494
41495  // If constant folding failed and we only have constants, then we have
41496  // multiple uses by a single non-variable shuffle - just bail.
41497 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41498 APInt UndefElts;
41499 SmallVector<APInt> RawBits;
41500 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41501 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41502 RawBits,
41503 /*AllowWholeUndefs*/ true,
41504 /*AllowPartialUndefs*/ true);
41505 })) {
41506 return SDValue();
41507 }
41508
41509 // Canonicalize the combined shuffle mask chain with horizontal ops.
41510 // NOTE: This will update the Ops and Mask.
41511  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41512          Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41513 return DAG.getBitcast(RootVT, HOp);
41514
41515 // Try to refine our inputs given our knowledge of target shuffle mask.
41516 for (auto I : enumerate(Ops)) {
41517 int OpIdx = I.index();
41518 SDValue &Op = I.value();
41519
41520 // What range of shuffle mask element values results in picking from Op?
41521 int Lo = OpIdx * Mask.size();
41522 int Hi = Lo + Mask.size();
41523
41524 // Which elements of Op do we demand, given the mask's granularity?
41525 APInt OpDemandedElts(Mask.size(), 0);
41526 for (int MaskElt : Mask) {
41527 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41528 int OpEltIdx = MaskElt - Lo;
41529 OpDemandedElts.setBit(OpEltIdx);
41530 }
41531 }
41532
41533 // Is the shuffle result smaller than the root?
41534 if (Op.getValueSizeInBits() < RootSizeInBits) {
41535 // We padded the mask with undefs. But we now need to undo that.
41536 unsigned NumExpectedVectorElts = Mask.size();
41537 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41538 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41539 assert(!OpDemandedElts.extractBits(
41540 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41541 "Demanding the virtual undef widening padding?");
41542 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41543 }
41544
41545 // The Op itself may be of different VT, so we need to scale the mask.
41546 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41547 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41548
41549    // Can this operand be simplified any further, given its demanded elements?
41550    if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41551            Op, OpScaledDemandedElts, DAG))
41552 Op = NewOp;
41553 }
41554 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41555
41556 // Widen any subvector shuffle inputs we've collected.
41557 // TODO: Remove this to avoid generating temporary nodes, we should only
41558 // widen once combineX86ShuffleChain has found a match.
41559 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41560 return Op.getValueSizeInBits() < RootSizeInBits;
41561 })) {
41562 for (SDValue &Op : Ops)
41563 if (Op.getValueSizeInBits() < RootSizeInBits)
41564 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41565 RootSizeInBits);
41566 // Reresolve - we might have repeated subvector sources.
41567    resolveTargetShuffleInputsAndMask(Ops, Mask);
41568  }
41569
41570 // Handle the all undef/zero/ones cases.
41571 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41572 return DAG.getUNDEF(RootVT);
41573 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41574 return getZeroVector(RootVT, Subtarget, DAG, DL);
41575 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41576      !llvm::is_contained(Mask, SM_SentinelZero))
41577    return getOnesVector(RootVT, DAG, DL);
41578
41579 assert(!Ops.empty() && "Shuffle with no inputs detected");
41580
41581 // We can only combine unary and binary shuffle mask cases.
41582 if (Ops.size() <= 2) {
41583 // Minor canonicalization of the accumulated shuffle mask to make it easier
41584 // to match below. All this does is detect masks with sequential pairs of
41585 // elements, and shrink them to the half-width mask. It does this in a loop
41586 // so it will reduce the size of the mask to the minimal width mask which
41587 // performs an equivalent shuffle.
41588 while (Mask.size() > 1) {
41589 SmallVector<int, 64> WidenedMask;
41590 if (!canWidenShuffleElements(Mask, WidenedMask))
41591 break;
41592 Mask = std::move(WidenedMask);
41593 }
41594
41595 // Canonicalization of binary shuffle masks to improve pattern matching by
41596 // commuting the inputs.
41597 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41598      ShuffleVectorSDNode::commuteMask(Mask);
41599      std::swap(Ops[0], Ops[1]);
41600 }
41601
41602 // Try to combine into a single shuffle instruction.
41603 if (SDValue Shuffle = combineX86ShuffleChain(
41604 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41605 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41606 IsMaskedShuffle, DAG, DL, Subtarget))
41607 return Shuffle;
41608
41609 // If all the operands come from the same larger vector, fallthrough and try
41610 // to use combineX86ShuffleChainWithExtract.
41611    SDValue LHS = peekThroughBitcasts(Ops.front());
41612    SDValue RHS = peekThroughBitcasts(Ops.back());
41613    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41614 (RootSizeInBits / Mask.size()) != 64 ||
41615 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41616 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41617 LHS.getOperand(0) != RHS.getOperand(0))
41618 return SDValue();
41619 }
41620
41621 // If that failed and any input is extracted then try to combine as a
41622 // shuffle with the larger type.
41623  return combineX86ShuffleChainWithExtract(
41624      Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41625 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41626 DAG, DL, Subtarget);
41627}
41628
41629/// Helper entry wrapper to combineX86ShufflesRecursively.
41630static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41631                                             const X86Subtarget &Subtarget) {
41632  return combineX86ShufflesRecursively(
41633      {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41634 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41635 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41636 SDLoc(Op), Subtarget);
41637}
41638
41639/// Get the PSHUF-style mask from PSHUF node.
41640///
41641/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41642/// PSHUF-style masks that can be reused with such instructions.
41643static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41644  MVT VT = N.getSimpleValueType();
41645  SmallVector<int, 4> Mask;
41646  SmallVector<SDValue, 2> Ops;
41647  bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41648 (void)HaveMask;
41649 assert(HaveMask);
41650
41651 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41652 // matter. Check that the upper masks are repeats and remove them.
41653 if (VT.getSizeInBits() > 128) {
41654 int LaneElts = 128 / VT.getScalarSizeInBits();
41655#ifndef NDEBUG
41656 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41657 for (int j = 0; j < LaneElts; ++j)
41658 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41659 "Mask doesn't repeat in high 128-bit lanes!");
41660#endif
41661 Mask.resize(LaneElts);
41662 }
41663
41664 switch (N.getOpcode()) {
41665 case X86ISD::PSHUFD:
41666 return Mask;
41667 case X86ISD::PSHUFLW:
41668 Mask.resize(4);
41669 return Mask;
41670 case X86ISD::PSHUFHW:
41671 Mask.erase(Mask.begin(), Mask.begin() + 4);
41672 for (int &M : Mask)
41673 M -= 4;
41674 return Mask;
41675 default:
41676 llvm_unreachable("No valid shuffle instruction found!");
41677 }
41678}
41679
41680/// Get the expanded blend mask from a BLENDI node.
41681/// For v16i16 nodes, this will splat the repeated i8 mask.
41682static APInt getBLENDIBlendMask(SDValue V) {
41683  assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41684 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41685 APInt Mask = V.getConstantOperandAPInt(2);
41686 if (Mask.getBitWidth() > NumElts)
41687 Mask = Mask.trunc(NumElts);
41688 if (NumElts == 16) {
41689 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41690 Mask = APInt::getSplat(16, Mask);
41691 }
41692 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41693 return Mask;
41694}
41695
41696/// Search for a combinable shuffle across a chain ending in pshufd.
41697///
41698/// We walk up the chain and look for a combinable shuffle, skipping over
41699/// shuffles that we could hoist this shuffle's transformation past without
41700/// altering anything.
41701static SDValue combineRedundantDwordShuffle(SDValue N,
41702                                            MutableArrayRef<int> Mask,
41703                                            const SDLoc &DL,
41704 SelectionDAG &DAG) {
41705 assert(N.getOpcode() == X86ISD::PSHUFD &&
41706 "Called with something other than an x86 128-bit half shuffle!");
41707
41708 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41709 // of the shuffles in the chain so that we can form a fresh chain to replace
41710 // this one.
41711  SmallVector<SDValue, 8> Chain;
41712  SDValue V = N.getOperand(0);
41713 for (; V.hasOneUse(); V = V.getOperand(0)) {
41714 switch (V.getOpcode()) {
41715 default:
41716 return SDValue(); // Nothing combined!
41717
41718 case ISD::BITCAST:
41719 // Skip bitcasts as we always know the type for the target specific
41720 // instructions.
41721 continue;
41722
41723 case X86ISD::PSHUFD:
41724 // Found another dword shuffle.
41725 break;
41726
41727 case X86ISD::PSHUFLW:
41728 // Check that the low words (being shuffled) are the identity in the
41729 // dword shuffle, and the high words are self-contained.
41730 if (Mask[0] != 0 || Mask[1] != 1 ||
41731 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41732 return SDValue();
41733
41734 Chain.push_back(V);
41735 continue;
41736
41737 case X86ISD::PSHUFHW:
41738 // Check that the high words (being shuffled) are the identity in the
41739 // dword shuffle, and the low words are self-contained.
41740 if (Mask[2] != 2 || Mask[3] != 3 ||
41741 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41742 return SDValue();
41743
41744 Chain.push_back(V);
41745 continue;
41746
41747 case X86ISD::UNPCKL:
41748 case X86ISD::UNPCKH:
41749 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41750 // shuffle into a preceding word shuffle.
41751 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41752 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41753 return SDValue();
41754
41755 // Search for a half-shuffle which we can combine with.
41756 unsigned CombineOp =
41757 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41758 if (V.getOperand(0) != V.getOperand(1) ||
41759 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41760 return SDValue();
41761 Chain.push_back(V);
41762 V = V.getOperand(0);
41763 do {
41764 switch (V.getOpcode()) {
41765 default:
41766 return SDValue(); // Nothing to combine.
41767
41768 case X86ISD::PSHUFLW:
41769 case X86ISD::PSHUFHW:
41770 if (V.getOpcode() == CombineOp)
41771 break;
41772
41773 Chain.push_back(V);
41774
41775 [[fallthrough]];
41776 case ISD::BITCAST:
41777 V = V.getOperand(0);
41778 continue;
41779 }
41780 break;
41781 } while (V.hasOneUse());
41782 break;
41783 }
41784 // Break out of the loop if we break out of the switch.
41785 break;
41786 }
41787
41788 if (!V.hasOneUse())
41789 // We fell out of the loop without finding a viable combining instruction.
41790 return SDValue();
41791
41792 // Merge this node's mask and our incoming mask.
41793  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41794  for (int &M : Mask)
41795 M = VMask[M];
41796 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41797 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41798
41799 // Rebuild the chain around this new shuffle.
41800 while (!Chain.empty()) {
41801 SDValue W = Chain.pop_back_val();
41802
41803 if (V.getValueType() != W.getOperand(0).getValueType())
41804 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41805
41806 switch (W.getOpcode()) {
41807 default:
41808 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41809
41810 case X86ISD::UNPCKL:
41811 case X86ISD::UNPCKH:
41812 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41813 break;
41814
41815 case X86ISD::PSHUFD:
41816 case X86ISD::PSHUFLW:
41817 case X86ISD::PSHUFHW:
41818 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41819 break;
41820 }
41821 }
41822 if (V.getValueType() != N.getValueType())
41823 V = DAG.getBitcast(N.getValueType(), V);
41824
41825 // Return the new chain to replace N.
41826 return V;
41827}
41828
41829// Attempt to commute shufps LHS loads:
41830// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
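// Commuting SHUFP(LHS,RHS,imm) to SHUFP(RHS,LHS,imm') swaps the immediate's
// nibbles (e.g. 0xB1 -> 0x1B) and produces the original result with its two
// 64-bit halves exchanged, which the callers below compensate for by flipping
// selector bits in the outer shuffle immediate (the xor 0xAA / 0x0A / 0xA0).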
41831static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41832                                      SelectionDAG &DAG) {
41833 // TODO: Add vXf64 support.
41834 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41835 return SDValue();
41836
41837 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41838 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41839 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41840 return SDValue();
41841 SDValue N0 = V.getOperand(0);
41842 SDValue N1 = V.getOperand(1);
41843 unsigned Imm = V.getConstantOperandVal(2);
41844 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41845 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41846 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41847 return SDValue();
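// SHUFP's low immediate nibble selects from the first operand and the high
// nibble from the second, so commuting the operands means swapping the two
// nibbles. The result's two halves come out swapped; the callers below
// compensate by adjusting the outer permute immediate (the 0xAA/0x0A/0xA0 xors).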
41848 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41849 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41850 DAG.getTargetConstant(Imm, DL, MVT::i8));
41851 };
41852
41853 switch (N.getOpcode()) {
41854 case X86ISD::VPERMILPI:
41855 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41856 unsigned Imm = N.getConstantOperandVal(1);
41857 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41858 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41859 }
41860 break;
41861 case X86ISD::SHUFP: {
41862 SDValue N0 = N.getOperand(0);
41863 SDValue N1 = N.getOperand(1);
41864 unsigned Imm = N.getConstantOperandVal(2);
41865 if (N0 == N1) {
41866 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41867 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41868 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41869 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41870 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41871 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41872 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41873 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41874 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41875 }
41876 break;
41877 }
41878 }
41879
41880 return SDValue();
41881}
41882
41883// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41884// iff we don't demand the same element index for both X and Y.
41885static SDValue
41886 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41887 const APInt &DemandedElts, SelectionDAG &DAG,
41888 const X86Subtarget &Subtarget, const SDLoc &DL) {
41889 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41890 if (!N0.hasOneUse() || !N1.hasOneUse())
41891 return SDValue();
41892
41893 unsigned NumElts = VT.getVectorNumElements();
41894 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41895 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41896
41897 // See if both operands are shuffles, and that we can scale the shuffle masks
41898 // to the same width as the blend mask.
41899 // TODO: Support SM_SentinelZero?
41900 SmallVector<SDValue, 2> Ops0, Ops1;
41901 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41902 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41903 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41904 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41905 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41906 return SDValue();
41907
41908 // Determine the demanded elts from both permutes.
41909 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41910 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41911 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41912 Demanded1,
41913 /*AllowUndefElts=*/true) ||
41914 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41915 DemandedRHS0, /*AllowUndefElts=*/true) ||
41916 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41917 DemandedRHS1, /*AllowUndefElts=*/true))
41918 return SDValue();
41919
41920 // Confirm that we only use a single operand from both permutes and that we
41921 // don't demand the same index from both.
41922 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41923 DemandedLHS0.intersects(DemandedLHS1))
41924 return SDValue();
41925
41926 // Use the permute demanded elts masks as the new blend mask.
41927 // Create the new permute mask as a blend of the 2 original permute masks.
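// e.g. with X = [x0..x3], Y = [y0..y3]:
//   blend<0,5,2,7>(permute<2,3,0,1>(X), permute<0,3,2,1>(Y)) = [x2,y3,x0,y1]
// becomes permute<2,3,0,1>(blend<0,5,2,7>(X, Y)), since X and Y never feed the
// same element index.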
41928 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41929 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41930 for (unsigned I = 0; I != NumElts; ++I) {
41931 if (Demanded0[I]) {
41932 int M = ScaledMask0[I];
41933 if (0 <= M) {
41934 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41935 "BlendMask demands LHS AND RHS");
41936 NewBlendMask[M] = M;
41937 NewPermuteMask[I] = M;
41938 }
41939 } else if (Demanded1[I]) {
41940 int M = ScaledMask1[I];
41941 if (0 <= M) {
41942 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41943 "BlendMask demands LHS AND RHS");
41944 NewBlendMask[M] = M + NumElts;
41945 NewPermuteMask[I] = M;
41946 }
41947 }
41948 }
41949 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41950 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41951
41952 // v16i16 shuffles can explode in complexity very easily, only accept them if
41953 // the blend mask is the same in the 128-bit subvectors (or can widen to
41954 // v8i32) and the permute can be widened as well.
41955 if (VT == MVT::v16i16) {
41956 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41957 !canWidenShuffleElements(NewBlendMask))
41958 return SDValue();
41959 if (!canWidenShuffleElements(NewPermuteMask))
41960 return SDValue();
41961 }
41962
41963 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41964 // widened to a lane permute (vperm2f128).
41965 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41966 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41967 NewPermuteMask) &&
41968 !canScaleShuffleElements(NewPermuteMask, 2))
41969 return SDValue();
41970
41971 SDValue NewBlend =
41972 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41973 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41974 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41975 NewPermuteMask);
41976}
41977
41978// TODO - move this to TLI like isBinOp?
41979static bool isUnaryOp(unsigned Opcode) {
41980 switch (Opcode) {
41981 case ISD::CTLZ:
41982 case ISD::CTTZ:
41983 case ISD::CTPOP:
41984 return true;
41985 }
41986 return false;
41987}
41988
41989// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41990// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41991 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41992 const SDLoc &DL) {
41993 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41994 EVT ShuffleVT = N.getValueType();
41995 unsigned Opc = N.getOpcode();
41996
41997 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41998 // AllZeros/AllOnes constants are freely shuffled and will peek through
41999 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42000 // merge with target shuffles if it has one use so shuffle combining is
42001 // likely to kick in. Shuffles of splats are expected to be removed.
42002 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42003 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42004 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42005 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42006 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42007 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42008 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42009 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42010 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42011 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42012 };
42013 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42014 // Ensure we only shuffle whole vector src elements, unless it's a logical
42015 // binop, where we can more aggressively move shuffles from dst to src.
42016 return isLogicOp(BinOp) ||
42017 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42018 };
42019
42020 switch (Opc) {
42021 // Unary and Unary+Permute Shuffles.
42022 case X86ISD::PSHUFB: {
42023 // Don't merge PSHUFB if it contains zero'd elements.
42024 SmallVector<int> Mask;
42025 SmallVector<SDValue> Ops;
42026 if (!getTargetShuffleMask(N, false, Ops, Mask))
42027 break;
42028 [[fallthrough]];
42029 }
42030 case X86ISD::VBROADCAST:
42031 case X86ISD::MOVDDUP:
42032 case X86ISD::PSHUFD:
42033 case X86ISD::PSHUFHW:
42034 case X86ISD::PSHUFLW:
42035 case X86ISD::VPERMV:
42036 case X86ISD::VPERMI:
42037 case X86ISD::VPERMILPI: {
42038 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42039 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42040 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42041 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42042 unsigned SrcOpcode = N0.getOpcode();
42043 EVT OpVT = N0.getValueType();
42044 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42045 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42046 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42047 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42048 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42049 IsMergeableWithShuffle(Op01, FoldShuf)) {
42050 SDValue LHS, RHS;
42051 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42052 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42053 if (Opc == X86ISD::VPERMV) {
42054 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42055 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42056 } else if (N.getNumOperands() == 2) {
42057 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42058 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42059 } else {
42060 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42061 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42062 }
42063 return DAG.getBitcast(ShuffleVT,
42064 DAG.getNode(SrcOpcode, DL, OpVT,
42065 DAG.getBitcast(OpVT, LHS),
42066 DAG.getBitcast(OpVT, RHS)));
42067 }
42068 }
42069 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42070 OpVT.getScalarSizeInBits() ==
42071 N0.getOperand(0).getScalarValueSizeInBits()) {
42072 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42073 if (Opc == X86ISD::VPERMV)
42074 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42075 else if (N.getNumOperands() == 2)
42076 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42077 else
42078 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42079 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42080 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42081 }
42082 }
42083 break;
42084 }
42085 // Binary and Binary+Permute Shuffles.
42086 case X86ISD::INSERTPS: {
42087 // Don't merge INSERTPS if it contains zero'd elements.
42088 unsigned InsertPSMask = N.getConstantOperandVal(2);
42089 unsigned ZeroMask = InsertPSMask & 0xF;
42090 if (ZeroMask != 0)
42091 break;
42092 [[fallthrough]];
42093 }
42094 case X86ISD::MOVSD:
42095 case X86ISD::MOVSS:
42096 case X86ISD::BLENDI:
42097 case X86ISD::SHUFP:
42098 case X86ISD::UNPCKH:
42099 case X86ISD::UNPCKL: {
42100 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42101 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42102 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42103 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42104 unsigned SrcOpcode = N0.getOpcode();
42105 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42106 N0.getValueType() == N1.getValueType() &&
42107 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42108 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42109 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42110 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42111 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42112 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42113 // Ensure the total number of shuffles doesn't increase by folding this
42114 // shuffle through to the source ops.
42115 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42116 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42117 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42118 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42119 SDValue LHS, RHS;
42120 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42121 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42122 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42123 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42124 if (N.getNumOperands() == 3) {
42125 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42126 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42127 } else {
42128 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42129 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42130 }
42131 EVT OpVT = N0.getValueType();
42132 return DAG.getBitcast(ShuffleVT,
42133 DAG.getNode(SrcOpcode, DL, OpVT,
42134 DAG.getBitcast(OpVT, LHS),
42135 DAG.getBitcast(OpVT, RHS)));
42136 }
42137 }
42138 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42139 N0.getValueType() == N1.getValueType() &&
42140 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42141 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42142 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42143 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42144 SDValue Res;
42145 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42146 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42147 if (N.getNumOperands() == 3) {
42148 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42149 } else {
42150 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42151 }
42152 EVT OpVT = N0.getValueType();
42153 return DAG.getBitcast(
42154 ShuffleVT,
42155 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42156 }
42157 // TODO: We can generalize this for other shuffles/conversions.
42158 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42159 N1.getOpcode() == SrcOpcode &&
42160 N0.getValueType() == N1.getValueType() &&
42161 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42162 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42163 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42164 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42165 EVT OpSrcVT = N0.getOperand(0).getValueType();
42166 EVT OpDstVT = N0.getValueType();
42167 SDValue Res =
42168 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42169 return DAG.getBitcast(ShuffleVT,
42170 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42171 }
42172 }
42173 break;
42174 }
42175 }
42176 return SDValue();
42177}
42178
42179/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
42180 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42181 SelectionDAG &DAG,
42182 const SDLoc &DL) {
42183 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42184
42185 MVT VT = V.getSimpleValueType();
42186 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42187 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42188 unsigned SrcOpc0 = Src0.getOpcode();
42189 unsigned SrcOpc1 = Src1.getOpcode();
42190 EVT SrcVT0 = Src0.getValueType();
42191 EVT SrcVT1 = Src1.getValueType();
42192
42193 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42194 return SDValue();
42195
42196 switch (SrcOpc0) {
42197 case X86ISD::MOVDDUP: {
42198 SDValue LHS = Src0.getOperand(0);
42199 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42200 SDValue Res =
42201 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42202 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42203 return DAG.getBitcast(VT, Res);
42204 }
42205 case X86ISD::VPERMILPI:
42206 // TODO: Handle v4f64 permutes with different low/high lane masks.
42207 if (SrcVT0 == MVT::v4f64) {
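// For v4f64 the VPERMILPI immediate holds one selector bit per element
// (bits [1:0] for the low 128-bit lane, bits [3:2] for the high lane); only
// proceed if both lanes use the same pattern, so the permute stays valid
// after VPERM2X128 swaps the lanes.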
42208 uint64_t Mask = Src0.getConstantOperandVal(1);
42209 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42210 break;
42211 }
42212 [[fallthrough]];
42213 case X86ISD::VSHLI:
42214 case X86ISD::VSRLI:
42215 case X86ISD::VSRAI:
42216 case X86ISD::PSHUFD:
42217 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42218 SDValue LHS = Src0.getOperand(0);
42219 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42220 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42221 V.getOperand(2));
42222 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42223 return DAG.getBitcast(VT, Res);
42224 }
42225 break;
42226 }
42227
42228 return SDValue();
42229}
42230
42231/// Try to combine x86 target specific shuffles.
42232 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42233 SelectionDAG &DAG,
42234 TargetLowering::DAGCombinerInfo &DCI,
42235 const X86Subtarget &Subtarget) {
42236 using namespace SDPatternMatch;
42237
42238 MVT VT = N.getSimpleValueType();
42239 unsigned NumElts = VT.getVectorNumElements();
42240 SmallVector<int, 4> Mask;
42241 unsigned Opcode = N.getOpcode();
42242 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42243
42244 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42245 return R;
42246
42247 // Handle specific target shuffles.
42248 switch (Opcode) {
42249 case X86ISD::MOVDDUP: {
42250 SDValue Src = N.getOperand(0);
42251 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42252 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42253 ISD::isNormalLoad(Src.getNode())) {
42254 LoadSDNode *LN = cast<LoadSDNode>(Src);
42255 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42256 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42257 DCI.CombineTo(N.getNode(), Movddup);
42258 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42259 DCI.recursivelyDeleteUnusedNodes(LN);
42260 return N; // Return N so it doesn't get rechecked!
42261 }
42262 }
42263
42264 return SDValue();
42265 }
42266 case X86ISD::VBROADCAST: {
42267 SDValue Src = N.getOperand(0);
42268 SDValue BC = peekThroughBitcasts(Src);
42269 EVT SrcVT = Src.getValueType();
42270 EVT BCVT = BC.getValueType();
42271
42272 // If broadcasting from another shuffle, attempt to simplify it.
42273 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42274 if (isTargetShuffle(BC.getOpcode()) &&
42275 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42276 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42277 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42278 SM_SentinelUndef);
42279 for (unsigned i = 0; i != Scale; ++i)
42280 DemandedMask[i] = i;
42281 if (SDValue Res = combineX86ShufflesRecursively(
42282 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42283 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42284 /*AllowVariableCrossLaneMask=*/true,
42285 /*AllowVariablePerLaneMask=*/true,
42286 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42287 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42288 DAG.getBitcast(SrcVT, Res));
42289 }
42290
42291 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42292 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42293 if (Src.getOpcode() == ISD::BITCAST &&
42294 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42295 TLI.isTypeLegal(BCVT) &&
42296 FixedVectorType::isValidElementType(
42297 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42298 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42299 VT.getVectorNumElements());
42300 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42301 }
42302
42303 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42304 // If we're re-broadcasting a smaller type then broadcast with that type and
42305 // bitcast.
42306 // TODO: Do this for any splat?
42307 if (Src.getOpcode() == ISD::BITCAST &&
42308 (BC.getOpcode() == X86ISD::VBROADCAST ||
42309 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42310 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42311 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42312 MVT NewVT =
42313 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42314 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42315 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42316 }
42317
42318 // Reduce broadcast source vector to lowest 128-bits.
42319 if (SrcVT.getSizeInBits() > 128)
42320 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42321 extract128BitVector(Src, 0, DAG, DL));
42322
42323 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42324 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42325 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42326 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42327
42328 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42329 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42330 isNullConstant(Src.getOperand(1)) &&
42331 Src.getValueType() ==
42332 Src.getOperand(0).getValueType().getScalarType() &&
42333 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42334 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42335
42336 // Share broadcast with the longest vector and extract low subvector (free).
42337 // Ensure the same SDValue from the SDNode use is being used.
42338 for (SDNode *User : Src->users())
42339 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42340 Src == User->getOperand(0) &&
42341 User->getValueSizeInBits(0).getFixedValue() >
42342 VT.getFixedSizeInBits()) {
42343 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42344 VT.getSizeInBits());
42345 }
42346
42347 // vbroadcast(scalarload X) -> vbroadcast_load X
42348 // For float loads, extract other uses of the scalar from the broadcast.
42349 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42350 ISD::isNormalLoad(Src.getNode())) {
42351 LoadSDNode *LN = cast<LoadSDNode>(Src);
42352 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42353 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42354 SDValue BcastLd =
42355 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42356 LN->getMemoryVT(), LN->getMemOperand());
42357 // If the load value is used only by N, replace it via CombineTo N.
42358 bool NoReplaceExtract = Src.hasOneUse();
42359 DCI.CombineTo(N.getNode(), BcastLd);
42360 if (NoReplaceExtract) {
42361 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42362 DCI.recursivelyDeleteUnusedNodes(LN);
42363 } else {
42364 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42365 DAG.getVectorIdxConstant(0, DL));
42366 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42367 }
42368 return N; // Return N so it doesn't get rechecked!
42369 }
42370
42371 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42372 // i16. So shrink it ourselves if we can make a broadcast_load.
42373 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42374 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42375 assert(Subtarget.hasAVX2() && "Expected AVX2");
42376 SDValue TruncIn = Src.getOperand(0);
42377
42378 // If this is a truncate of a non extending load we can just narrow it to
42379 // use a broadcast_load.
42380 if (ISD::isNormalLoad(TruncIn.getNode())) {
42381 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42382 // Unless it's volatile or atomic.
42383 if (LN->isSimple()) {
42384 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42385 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42386 SDValue BcastLd = DAG.getMemIntrinsicNode(
42387 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42388 LN->getPointerInfo(), LN->getBaseAlign(),
42389 LN->getMemOperand()->getFlags());
42390 DCI.CombineTo(N.getNode(), BcastLd);
42391 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42392 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42393 return N; // Return N so it doesn't get rechecked!
42394 }
42395 }
42396
42397 // If this is a truncate of an i16 extload, we can directly replace it.
42398 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42399 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42400 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42401 if (LN->getMemoryVT().getSizeInBits() == 16) {
42402 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42403 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42404 SDValue BcastLd =
42405 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42406 LN->getMemoryVT(), LN->getMemOperand());
42407 DCI.CombineTo(N.getNode(), BcastLd);
42408 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42409 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42410 return N; // Return N so it doesn't get rechecked!
42411 }
42412 }
42413
42414 // If this is a truncate of a load that has been shifted right, we can
42415 // offset the pointer and use a narrower load.
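// e.g. (i16 (trunc (srl (load i64 p), 32))) only needs the two bytes at p+4
// on little-endian x86, so broadcast an i16 load from that offset address.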
42416 if (TruncIn.getOpcode() == ISD::SRL &&
42417 TruncIn.getOperand(0).hasOneUse() &&
42418 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42419 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42420 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42421 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42422 // Make sure the shift amount and the load size are divisible by 16.
42423 // Don't do this if the load is volatile or atomic.
42424 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42425 LN->isSimple()) {
42426 unsigned Offset = ShiftAmt / 8;
42427 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42428 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42429 TypeSize::getFixed(Offset), DL);
42430 SDValue Ops[] = { LN->getChain(), Ptr };
42431 SDValue BcastLd = DAG.getMemIntrinsicNode(
42432 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42433 LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
42434 LN->getMemOperand()->getFlags());
42435 DCI.CombineTo(N.getNode(), BcastLd);
42436 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42437 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42438 return N; // Return N so it doesn't get rechecked!
42439 }
42440 }
42441 }
42442
42443 // vbroadcast(vzload X) -> vbroadcast_load X
42444 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42445 auto *LN = cast<MemIntrinsicSDNode>(Src);
42446 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42447 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42448 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42449 SDValue BcastLd =
42450 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42451 LN->getMemoryVT(), LN->getMemOperand());
42452 DCI.CombineTo(N.getNode(), BcastLd);
42453 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42454 DCI.recursivelyDeleteUnusedNodes(LN);
42455 return N; // Return N so it doesn't get rechecked!
42456 }
42457 }
42458
42459 // vbroadcast(vector load X) -> vbroadcast_load
42460 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42461 LoadSDNode *LN = cast<LoadSDNode>(Src);
42462 // Unless the load is volatile or atomic.
42463 if (LN->isSimple()) {
42464 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42465 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42466 SDValue BcastLd = DAG.getMemIntrinsicNode(
42467 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(),
42468 LN->getPointerInfo(), LN->getBaseAlign(),
42469 LN->getMemOperand()->getFlags());
42470 DCI.CombineTo(N.getNode(), BcastLd);
42471 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42472 DCI.recursivelyDeleteUnusedNodes(LN);
42473 return N; // Return N so it doesn't get rechecked!
42474 }
42475 }
42476
42477 return SDValue();
42478 }
42479 case X86ISD::VZEXT_MOVL: {
42480 SDValue N0 = N.getOperand(0);
42481
42482 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42483 // Zeroing out the upper elements means we're just shifting a zero value.
42484 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42485 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42486 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42487 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42488 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42489 if (N0.hasOneUse())
42490 return DAG.getNode(
42491 N0.getOpcode(), DL, VT,
42492 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42493 N0.getOperand(1));
42494 }
42495
42496 // If this a vzmovl of a full vector load, replace it with a vzload, unless
42497 // the load is volatile.
42498 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42499 auto *LN = cast<LoadSDNode>(N0);
42500 if (SDValue VZLoad =
42501 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42502 DCI.CombineTo(N.getNode(), VZLoad);
42503 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42504 DCI.recursivelyDeleteUnusedNodes(LN);
42505 return N;
42506 }
42507 }
42508
42509 // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42510 // and can just use a VZEXT_LOAD.
42511 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42512 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42513 auto *LN = cast<MemSDNode>(N0);
42514 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42515 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42516 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42517 SDValue VZLoad =
42518 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42519 LN->getMemoryVT(), LN->getMemOperand());
42520 DCI.CombineTo(N.getNode(), VZLoad);
42521 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42522 DCI.recursivelyDeleteUnusedNodes(LN);
42523 return N;
42524 }
42525 }
42526
42527 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42528 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42529 // if the upper bits of the i64 are zero.
42530 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42531 N0.getOperand(0).hasOneUse() &&
42532 N0.getOperand(0).getValueType() == MVT::i64) {
42533 SDValue In = N0.getOperand(0);
42534 APInt Mask = APInt::getHighBitsSet(64, 32);
42535 if (DAG.MaskedValueIsZero(In, Mask)) {
42536 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42537 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42538 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42539 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42540 return DAG.getBitcast(VT, Movl);
42541 }
42542 }
42543
42544 // Load a scalar integer constant directly to XMM instead of transferring an
42545 // immediate value from GPR.
42546 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42547 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42548 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42549 // Create a vector constant - scalar constant followed by zeros.
42550 EVT ScalarVT = N0.getOperand(0).getValueType();
42551 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42552 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42553 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42554 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42555
42556 // Load the vector constant from constant pool.
42557 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42558 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42559 MachinePointerInfo MPI =
42560 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42561 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42562 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42563 MachineMemOperand::MOLoad);
42564 }
42565 }
42566
42567 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42568 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42569 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42570 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42571 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42572 SDValue V = peekThroughOneUseBitcasts(N0);
42573
42574 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42575 isNullConstant(V.getOperand(2))) {
42576 SDValue In = V.getOperand(1);
42577 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42578 In.getValueSizeInBits() /
42579 VT.getScalarSizeInBits());
42580 In = DAG.getBitcast(SubVT, In);
42581 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42582 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42583 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42584 V.getOperand(2));
42585 }
42586 }
42587
42588 return SDValue();
42589 }
42590 case X86ISD::BLENDI: {
42591 SDValue N0 = N.getOperand(0);
42592 SDValue N1 = N.getOperand(1);
42593 unsigned EltBits = VT.getScalarSizeInBits();
42594
42595 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42596 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42597 // TODO: Handle MVT::v16i16 repeated blend mask.
42598 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42599 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42600 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42601 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42602 unsigned NewSize = SrcVT.getVectorNumElements();
42603 APInt BlendMask = getBLENDIBlendMask(N);
42604 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42605 return DAG.getBitcast(
42606 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42607 N1.getOperand(0),
42608 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42609 DL, MVT::i8)));
42610 }
42611 }
42612 // Share PSHUFB masks:
42613 // blend(pshufb(x,m1),pshufb(y,m2))
42614 // --> m3 = blend(m1,m2)
42615 // blend(pshufb(x,m3),pshufb(y,m3))
42616 if (N0.hasOneUse() && N1.hasOneUse()) {
42617 SmallVector<int> Mask, ByteMask;
42618 SmallVector<SDValue> Ops;
42619 SDValue LHS = peekThroughOneUseBitcasts(N0);
42620 SDValue RHS = peekThroughOneUseBitcasts(N1);
42621 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42622 RHS.getOpcode() == X86ISD::PSHUFB &&
42623 LHS.getOperand(1) != RHS.getOperand(1) &&
42624 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42625 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42626 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42627 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42628 "BLENDI decode mismatch");
42629 MVT ShufVT = LHS.getSimpleValueType();
42630 SDValue MaskLHS = LHS.getOperand(1);
42631 SDValue MaskRHS = RHS.getOperand(1);
42632 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42633 if (SDValue NewMask = combineX86ShufflesConstants(
42634 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42635 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42636 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42637 LHS.getOperand(0), NewMask);
42638 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42639 RHS.getOperand(0), NewMask);
42640 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42641 DAG.getBitcast(VT, NewLHS),
42642 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42643 }
42644 }
42645 }
42646 }
42647 return SDValue();
42648 }
42649 case X86ISD::SHUFP: {
42650 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42651 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42652 // TODO: Support types other than v4f32.
42653 if (VT == MVT::v4f32) {
42654 bool Updated = false;
42655 SmallVector<int> Mask;
42656 SmallVector<SDValue> Ops;
42657 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42658 for (int i = 0; i != 2; ++i) {
42659 SmallVector<SDValue> SubOps;
42660 SmallVector<int> SubMask, SubScaledMask;
42661 SDValue Sub = peekThroughBitcasts(Ops[i]);
42662 // TODO: Scaling might be easier if we specify the demanded elts.
42663 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42664 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42665 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42666 int Ofs = i * 2;
42667 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42668 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42669 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42670 Updated = true;
42671 }
42672 }
42673 }
42674 if (Updated) {
42675 for (int &M : Mask)
42676 M %= 4;
42677 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42678 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42679 }
42680 }
42681 return SDValue();
42682 }
42683 case X86ISD::VPERMI: {
42684 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42685 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42686 SDValue N0 = N.getOperand(0);
42687 SDValue N1 = N.getOperand(1);
42688 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42689 if (N0.getOpcode() == ISD::BITCAST &&
42690 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42691 SDValue Src = N0.getOperand(0);
42692 EVT SrcVT = Src.getValueType();
42693 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42694 return DAG.getBitcast(VT, Res);
42695 }
42696 return SDValue();
42697 }
42698 case X86ISD::SHUF128: {
42699 // If we're permuting the upper 256-bits subvectors of a concatenation, then
42700 // see if we can peek through and access the subvector directly.
42701 if (VT.is512BitVector()) {
42702 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42703 // the upper subvector is used.
42704 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42705 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42706 uint64_t Mask = N->getConstantOperandVal(2);
42707 SmallVector<SDValue> LHSOps, RHSOps;
42708 SDValue NewLHS, NewRHS;
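// Result lane i is selected by immediate bits [2*i+1 : 2*i]; 0x0A covers the
// msbs of the two lanes sourced from LHS and 0xA0 those sourced from RHS.
// Once the upper subvector is accessed directly, those msbs are cleared
// below (Mask &= ~0x0A / ~0xA0).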
42709 if ((Mask & 0x0A) == 0x0A &&
42710 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42711 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42712 Mask &= ~0x0A;
42713 }
42714 if ((Mask & 0xA0) == 0xA0 &&
42715 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42716 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42717 Mask &= ~0xA0;
42718 }
42719 if (NewLHS || NewRHS)
42720 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42721 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42722 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42723 DAG.getTargetConstant(Mask, DL, MVT::i8));
42724 }
42725 return SDValue();
42726 }
42727 case X86ISD::VPERM2X128: {
42728 SDValue LHS = N->getOperand(0);
42729 SDValue RHS = N->getOperand(1);
42730 unsigned Imm = N.getConstantOperandVal(2) & 255;
42731
42732 // Canonicalize unary/repeated operands to LHS.
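// In the VPERM2X128 immediate each nibble selects one 128-bit result lane:
// bit 1 picks the source operand, bit 0 picks its low/high lane and bit 3
// zeroes the lane. Swapping the operands is therefore imm ^ 0x22, and with a
// repeated operand the source-select bits can simply be cleared.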
42733 if (LHS.isUndef() && !RHS.isUndef())
42734 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42735 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42736 if (LHS == RHS)
42737 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42738 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42739
42740 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42741 if (LHS.getOpcode() == ISD::BITCAST &&
42742 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42743 EVT SrcVT = LHS.getOperand(0).getValueType();
42744 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42745 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42746 DAG.getBitcast(SrcVT, LHS),
42747 DAG.getBitcast(SrcVT, RHS),
42748 N->getOperand(2)));
42749 }
42750 }
42751
42752 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42753 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42754 return Res;
42755
42756 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42757 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42758 auto FindSubVector128 = [&](unsigned Idx) {
42759 if (Idx > 3)
42760 return SDValue();
42761 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42762 SmallVector<SDValue> SubOps;
42763 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42764 return SubOps[Idx & 1];
42765 unsigned NumElts = Src.getValueType().getVectorNumElements();
42766 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42767 Src.getOperand(1).getValueSizeInBits() == 128 &&
42768 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42769 return Src.getOperand(1);
42770 }
42771 return SDValue();
42772 };
42773 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42774 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42775 MVT SubVT = VT.getHalfNumVectorElementsVT();
42776 SubLo = DAG.getBitcast(SubVT, SubLo);
42777 SubHi = DAG.getBitcast(SubVT, SubHi);
42778 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42779 }
42780 }
42781
42782 // Attempt to match VBROADCAST*128 subvector broadcast load.
42783 if (RHS.isUndef()) {
42784 SmallVector<int, 4> Mask;
42785 DecodeVPERM2X128Mask(4, Imm, Mask);
42786 if (isUndefOrInRange(Mask, 0, 4)) {
42787 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42788 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42789 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42790 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42791 MVT MemVT = VT.getHalfNumVectorElementsVT();
42792 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42793 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42794 cast<LoadSDNode>(LHS), Ofs, DAG);
42795 }
42796 }
42797 }
42798
42799 return SDValue();
42800 }
42801 case X86ISD::PSHUFD:
42802 case X86ISD::PSHUFLW:
42803 case X86ISD::PSHUFHW: {
42804 SDValue N0 = N.getOperand(0);
42805 SDValue N1 = N.getOperand(1);
42806 if (N0->hasOneUse()) {
42807 SDValue V = peekThroughOneUseBitcasts(N0);
42808 switch (V.getOpcode()) {
42809 case X86ISD::VSHL:
42810 case X86ISD::VSRL:
42811 case X86ISD::VSRA:
42812 case X86ISD::VSHLI:
42813 case X86ISD::VSRLI:
42814 case X86ISD::VSRAI:
42815 case X86ISD::VROTLI:
42816 case X86ISD::VROTRI: {
42817 MVT InnerVT = V.getSimpleValueType();
42818 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42819 SDValue Res = DAG.getNode(Opcode, DL, VT,
42820 DAG.getBitcast(VT, V.getOperand(0)), N1);
42821 Res = DAG.getBitcast(InnerVT, Res);
42822 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42823 return DAG.getBitcast(VT, Res);
42824 }
42825 break;
42826 }
42827 }
42828 }
42829
42830 Mask = getPSHUFShuffleMask(N);
42831 assert(Mask.size() == 4);
42832 break;
42833 }
42834 case X86ISD::MOVSD:
42835 case X86ISD::MOVSH:
42836 case X86ISD::MOVSS: {
42837 SDValue N0 = N.getOperand(0);
42838 SDValue N1 = N.getOperand(1);
42839
42840 // Canonicalize scalar FPOps:
42841 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42842 // If commutable, allow OP(N1[0], N0[0]).
42843 unsigned Opcode1 = N1.getOpcode();
42844 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42845 Opcode1 == ISD::FDIV) {
42846 SDValue N10 = N1.getOperand(0);
42847 SDValue N11 = N1.getOperand(1);
42848 if (N10 == N0 ||
42849 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42850 if (N10 != N0)
42851 std::swap(N10, N11);
42852 MVT SVT = VT.getVectorElementType();
42853 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42854 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42855 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42856 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42857 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42858 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42859 }
42860 }
42861
42862 return SDValue();
42863 }
42864 case X86ISD::INSERTPS: {
42865 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42866 SDValue Op0 = N.getOperand(0);
42867 SDValue Op1 = N.getOperand(1);
42868 unsigned InsertPSMask = N.getConstantOperandVal(2);
42869 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42870 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42871 unsigned ZeroMask = InsertPSMask & 0xF;
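// INSERTPS immediate layout: bits [7:6] pick the element of Op1 to insert,
// bits [5:4] pick the destination slot in Op0, bits [3:0] zero result slots.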
42872
42873 // If we zero out all elements from Op0 then we don't need to reference it.
42874 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42875 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42876 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42877
42878 // If we zero out the element from Op1 then we don't need to reference it.
42879 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42880 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42881 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42882
42883 // Attempt to merge insertps Op1 with an inner target shuffle node.
42884 SmallVector<int, 8> TargetMask1;
42885 SmallVector<SDValue, 2> Ops1;
42886 APInt KnownUndef1, KnownZero1;
42887 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42888 KnownZero1)) {
42889 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42890 // Zero/UNDEF insertion - zero out element and remove dependency.
42891 InsertPSMask |= (1u << DstIdx);
42892 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42893 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42894 }
42895 // Update insertps mask srcidx and reference the source input directly.
42896 int M = TargetMask1[SrcIdx];
42897 assert(0 <= M && M < 8 && "Shuffle index out of range");
42898 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42899 Op1 = Ops1[M < 4 ? 0 : 1];
42900 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42901 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42902 }
42903
42904 // Attempt to merge insertps Op0 with an inner target shuffle node.
42905 SmallVector<int, 8> TargetMask0;
42906 SmallVector<SDValue, 2> Ops0;
42907 APInt KnownUndef0, KnownZero0;
42908 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42909 KnownZero0)) {
42910 bool Updated = false;
42911 bool UseInput00 = false;
42912 bool UseInput01 = false;
42913 for (int i = 0; i != 4; ++i) {
42914 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42915 // No change if element is already zero or the inserted element.
42916 continue;
42917 }
42918
42919 if (KnownUndef0[i] || KnownZero0[i]) {
42920 // If the target mask is undef/zero then we must zero the element.
42921 InsertPSMask |= (1u << i);
42922 Updated = true;
42923 continue;
42924 }
42925
42926 // The input vector element must be inline.
42927 int M = TargetMask0[i];
42928 if (M != i && M != (i + 4))
42929 return SDValue();
42930
42931 // Determine which inputs of the target shuffle we're using.
42932 UseInput00 |= (0 <= M && M < 4);
42933 UseInput01 |= (4 <= M);
42934 }
42935
42936 // If we're not using both inputs of the target shuffle then use the
42937 // referenced input directly.
42938 if (UseInput00 && !UseInput01) {
42939 Updated = true;
42940 Op0 = Ops0[0];
42941 } else if (!UseInput00 && UseInput01) {
42942 Updated = true;
42943 Op0 = Ops0[1];
42944 }
42945
42946 if (Updated)
42947 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42948 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42949 }
42950
42951 // If we're inserting an element from a vbroadcast load, fold the
42952 // load into the X86insertps instruction. We need to convert the scalar
42953 // load to a vector and clear the source lane of the INSERTPS control.
42954 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42955 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42956 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42957 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42958 MemIntr->getBasePtr(),
42959 MemIntr->getMemOperand());
42960 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42961 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42962 Load),
42963 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42964 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42965 return Insert;
42966 }
42967 }
42968
42969 return SDValue();
42970 }
42971 case X86ISD::VPERMV: {
42972 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42973 SmallVector<int, 32> Mask;
42974 SmallVector<SDValue, 2> SrcOps, SubOps;
42975 SDValue Src = peekThroughBitcasts(N.getOperand(1));
42976 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
42977 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42978 collectConcatOps(Src.getNode(), SubOps, DAG)) {
42979 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42980 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42981 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42982 "Unexpected split ops");
42983 // Bail if we were permuting a widened vector.
42984 if (SubOps[1].isUndef() &&
42985 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
42986 return SDValue();
42987 // Bail if any subops would have folded into the concat.
42988 if (any_of(SubOps, isShuffleFoldableLoad))
42989 return SDValue();
42990 // Concat 4x128 back to 2x256.
42991 if (SubOps.size() == 4) {
42992 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42993 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42994 }
42995 // Convert mask to 2 operand shuffle.
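// Elements that came from the upper half of the original source now live in
// the low half of the second (widened) operand, so index M becomes
// NumElts + (M - HalfElts), i.e. M + HalfElts.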
42996 int HalfElts = NumElts / 2;
42997 for (int &M : Mask)
42998 M += M >= HalfElts ? HalfElts : 0;
42999 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43000 VT.getSizeInBits());
43001 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43002 VT.getSizeInBits());
43003 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43004 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43005 }
43006 return SDValue();
43007 }
43008 case X86ISD::VPERMV3: {
43009 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43010 bool CanConcat = VT.is128BitVector() ||
43011 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43012 SmallVector<SDValue, 2> SrcOps;
43013 SmallVector<int, 32> Mask;
43014 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43015 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43016 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43017 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43018 // Canonicalize to VPERMV if both sources are the same.
43019 if (V1 == V2) {
43020 for (int &M : Mask)
43021 M = (M < 0 ? M : (M & (NumElts - 1)));
43022 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43023 DAG.getUNDEF(VT), Subtarget, DAG);
43024 }
43025 // If sources are half width, then concat and use VPERMV with adjusted
43026 // mask.
43027 SDValue Ops[2];
43028 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43029 if (sd_match(V1,
43030 m_InsertSubvector(m_Undef(), m_Value(Ops[0]), m_Zero())) &&
43031 sd_match(V2,
43032 m_InsertSubvector(m_Undef(), m_Value(Ops[1]), m_Zero())) &&
43033 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43034 if (SDValue ConcatSrc =
43035 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
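// In the concatenated source the second operand's elements start at
// NumElts / 2, so remap indices that referred to N.getOperand(2)
// (>= NumElts) down to M - NumElts / 2.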
43036 for (int &M : Mask)
43037 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43038 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43039 DAG.getUNDEF(VT), Subtarget, DAG);
43040 }
43041 }
43042 // Commute foldable source to the RHS.
43043 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43044 !isShuffleFoldableLoad(N.getOperand(2))) {
43045 ShuffleVectorSDNode::commuteMask(Mask);
43046 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43047 N.getOperand(0), Subtarget, DAG);
43048 }
43049 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43050 // freely concatenated, with a commuted shuffle mask.
43051 if (CanConcat) {
43052 if (SDValue ConcatSrc = combineConcatVectorOps(
43053 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43054 Subtarget)) {
43055 ShuffleVectorSDNode::commuteMask(Mask);
43056 Mask.append(NumElts, SM_SentinelUndef);
43057 SDValue Perm =
43058 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43059 DAG.getUNDEF(WideVT), Subtarget, DAG);
43060 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43061 DAG.getVectorIdxConstant(0, DL));
43062 }
43063 }
43064 }
43065 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43066 // freely concatenated.
43067 if (CanConcat) {
43068 if (SDValue ConcatSrc = combineConcatVectorOps(
43069 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43070 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43071 DL, WideVT.getSizeInBits());
43072 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43074 DAG.getVectorIdxConstant(0, DL));
43075 }
43076 }
43077 return SDValue();
43078 }
43079 default:
43080 return SDValue();
43081 }
43082
43083 // Nuke no-op shuffles that show up after combining.
43084 if (isNoopShuffleMask(Mask))
43085 return N.getOperand(0);
43086
43087 // Look for simplifications involving one or two shuffle instructions.
43088 SDValue V = N.getOperand(0);
43089 switch (N.getOpcode()) {
43090 default:
43091 break;
43092 case X86ISD::PSHUFLW:
43093 case X86ISD::PSHUFHW:
43094 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43095
43096 // See if this reduces to a PSHUFD which is no more expensive and can
43097 // combine with more operations. Note that it has to at least flip the
43098 // dwords as otherwise it would have been removed as a no-op.
43099 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43100 int DMask[] = {0, 1, 2, 3};
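// Swapping the two word-pairs of a half is the same as swapping that half's
// two dwords, e.g. PSHUFLW<2,3,0,1> == PSHUFD<1,0,2,3> on the bitcast v4i32.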
43101 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43102 DMask[DOffset + 0] = DOffset + 1;
43103 DMask[DOffset + 1] = DOffset + 0;
43104 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43105 V = DAG.getBitcast(DVT, V);
43106 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43107 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43108 return DAG.getBitcast(VT, V);
43109 }
43110
43111 // Look for shuffle patterns which can be implemented as a single unpack.
43112 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43113 // only works when we have a PSHUFD followed by two half-shuffles.
43114 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43115 (V.getOpcode() == X86ISD::PSHUFLW ||
43116 V.getOpcode() == X86ISD::PSHUFHW) &&
43117 V.getOpcode() != N.getOpcode() &&
43118 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43119 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43120 if (D.getOpcode() == X86ISD::PSHUFD) {
43121 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43122 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43123 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43124 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43125 int WordMask[8];
43126 for (int i = 0; i < 4; ++i) {
43127 WordMask[i + NOffset] = Mask[i] + NOffset;
43128 WordMask[i + VOffset] = VMask[i] + VOffset;
43129 }
43130 // Map the word mask through the DWord mask.
43131 int MappedMask[8];
43132 for (int i = 0; i < 8; ++i)
43133 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43134 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43135 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43136 // We can replace all three shuffles with an unpack.
43137 V = DAG.getBitcast(VT, D.getOperand(0));
43138 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43139 : X86ISD::UNPCKH,
43140 DL, VT, V, V);
43141 }
43142 }
43143 }
43144
43145 break;
43146
43147 case X86ISD::PSHUFD:
43148 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43149 return NewN;
43150
43151 break;
43152 }
43153
43154 return SDValue();
43155}
43156
43157/// Checks if the shuffle mask takes subsequent elements
43158/// alternately from two vectors.
43159/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43160static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43161
43162 int ParitySrc[2] = {-1, -1};
43163 unsigned Size = Mask.size();
43164 for (unsigned i = 0; i != Size; ++i) {
43165 int M = Mask[i];
43166 if (M < 0)
43167 continue;
43168
43169 // Make sure we are using the matching element from the input.
43170 if ((M % Size) != i)
43171 return false;
43172
43173 // Make sure we use the same input for all elements of the same parity.
43174 int Src = M / Size;
43175 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43176 return false;
43177 ParitySrc[i % 2] = Src;
43178 }
43179
43180 // Make sure each input is used.
43181 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43182 return false;
43183
43184 Op0Even = ParitySrc[0] == 0;
43185 return true;
43186}
43187
43188 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
43189 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
43190/// are written to the parameters \p Opnd0 and \p Opnd1.
43191///
43192/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
43193/// so it is easier to generically match. We also insert dummy vector shuffle
43194/// nodes for the operands which explicitly discard the lanes which are unused
43195 /// by this operation, so that the fact that they're unused flows through the
43196 /// rest of the combiner.
43197static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43198 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43199 bool &IsSubAdd, bool &HasAllowContract) {
43200
43201 EVT VT = N->getValueType(0);
43202 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43203 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43204 !VT.getSimpleVT().isFloatingPoint())
43205 return false;
43206
43207 // We only handle target-independent shuffles.
43208 // FIXME: It would be easy and harmless to use the target shuffle mask
43209 // extraction tool to support more.
43210 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43211 return false;
43212
43213 SDValue V1 = N->getOperand(0);
43214 SDValue V2 = N->getOperand(1);
43215
43216 // Make sure we have an FADD and an FSUB.
43217 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43218 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43219 V1.getOpcode() == V2.getOpcode())
43220 return false;
43221
43222 // If there are other uses of these operations we can't fold them.
43223 if (!V1->hasOneUse() || !V2->hasOneUse())
43224 return false;
43225
43226 // Ensure that both operations have the same operands. Note that we can
43227 // commute the FADD operands.
43228 SDValue LHS, RHS;
43229 if (V1.getOpcode() == ISD::FSUB) {
43230 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43231 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43232 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43233 return false;
43234 } else {
43235 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43236 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43237 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43238 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43239 return false;
43240 }
43241
43242 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43243 bool Op0Even;
43244 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43245 return false;
43246
43247 // It's a subadd if the vector in the even parity is an FADD.
43248 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43249 : V2->getOpcode() == ISD::FADD;
43250 HasAllowContract =
43251 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43252
43253 Opnd0 = LHS;
43254 Opnd1 = RHS;
43255 return true;
43256}
43257
43258/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43259 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43260 const X86Subtarget &Subtarget,
43261 SelectionDAG &DAG) {
43262 // We only handle target-independent shuffles.
43263 // FIXME: It would be easy and harmless to use the target shuffle mask
43264 // extraction tool to support more.
43265 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43266 return SDValue();
43267
43268 MVT VT = N->getSimpleValueType(0);
43269 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43270 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43271 return SDValue();
43272
43273 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
43274 SDValue Op0 = N->getOperand(0);
43275 SDValue Op1 = N->getOperand(1);
43276 SDValue FMAdd = Op0, FMSub = Op1;
43277 if (FMSub.getOpcode() != X86ISD::FMSUB)
43278 std::swap(FMAdd, FMSub);
43279
43280 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43281 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43282 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43283 FMAdd.getOperand(2) != FMSub.getOperand(2))
43284 return SDValue();
43285
43286 // Check for correct shuffle mask.
43287 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43288 bool Op0Even;
43289 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43290 return SDValue();
43291
43292 // FMAddSub takes zeroth operand from FMSub node.
43293 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43294 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43295 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43296 FMAdd.getOperand(2));
43297}
43298
43299/// Try to combine a shuffle into a target-specific add-sub or
43300/// mul-add-sub node.
43301static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43302 const X86Subtarget &Subtarget,
43303 SelectionDAG &DAG) {
43304 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43305 return V;
43306
43307 SDValue Opnd0, Opnd1;
43308 bool IsSubAdd;
43309 bool HasAllowContract;
43310 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43311 HasAllowContract))
43312 return SDValue();
43313
43314 MVT VT = N->getSimpleValueType(0);
43315
43316 // Try to generate X86ISD::FMADDSUB node here.
43317 SDValue Opnd2;
43318 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43319 HasAllowContract)) {
43320 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43321 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43322 }
43323
43324 if (IsSubAdd)
43325 return SDValue();
43326
43327 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43328 // the ADDSUB idiom has been successfully recognized. There are no known
43329 // X86 targets with 512-bit ADDSUB instructions!
43330 if (VT.is512BitVector())
43331 return SDValue();
43332
43333 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43334 // the ADDSUB idiom has been successfully recognized. There are no known
43335 // X86 targets with FP16 ADDSUB instructions!
43336 if (VT.getVectorElementType() == MVT::f16)
43337 return SDValue();
43338
43339 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43340}
43341
43342/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43343/// low half of each source vector and does not set any high half elements in
43344/// the destination vector, narrow the shuffle to half its original size.
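/// Illustrative sketch (types assumed for exposition): a v8f32 shuffle whose
/// mask only references elements 0-3 of each source and leaves the upper four
/// result elements undef can be rewritten as a v4f32 shuffle of the low
/// halves, re-inserted into an undef v8f32 at index 0.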
43345static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43346 EVT VT = Shuf->getValueType(0);
43347 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
43348 return SDValue();
43349 if (!VT.is256BitVector() && !VT.is512BitVector())
43350 return SDValue();
43351
43352 // See if we can ignore all of the high elements of the shuffle.
43353 ArrayRef<int> Mask = Shuf->getMask();
43354 if (!isUndefUpperHalf(Mask))
43355 return SDValue();
43356
43357 // Check if the shuffle mask accesses only the low half of each input vector
43358 // (half-index output is 0 or 2).
43359 int HalfIdx1, HalfIdx2;
43360 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43361 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43362 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43363 return SDValue();
43364
43365 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43366 // The trick is knowing that all of the insert/extract are actually free
43367 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43368 // of narrow inputs into a narrow output, and that is always cheaper than
43369 // the wide shuffle that we started with.
43370 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43371 Shuf->getOperand(1), HalfMask, HalfIdx1,
43372 HalfIdx2, false, DAG, /*UseConcat*/ true);
43373}
43374
43375static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43376 TargetLowering::DAGCombinerInfo &DCI,
43377 const X86Subtarget &Subtarget) {
43378 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43379 if (SDValue V = narrowShuffle(Shuf, DAG))
43380 return V;
43381
43382 // If we have legalized the vector types, look for blends of FADD and FSUB
43383 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43384 SDLoc dl(N);
43385 EVT VT = N->getValueType(0);
43386 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43387 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43388 if (SDValue AddSub =
43389 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43390 return AddSub;
43391
43392 // Attempt to combine into a vector load/broadcast.
43393 if (SDValue LD = combineToConsecutiveLoads(
43394 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43395 return LD;
43396
43397 if (isTargetShuffle(N->getOpcode())) {
43398 SDValue Op(N, 0);
43399 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43400 return Shuffle;
43401
43402 // Try recursively combining arbitrary sequences of x86 shuffle
43403 // instructions into higher-order shuffles. We do this after combining
43404 // specific PSHUF instruction sequences into their minimal form so that we
43405 // can evaluate how many specialized shuffle instructions are involved in
43406 // a particular chain.
43407 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43408 return Res;
43409
43410 // Simplify source operands based on shuffle mask.
43411 // TODO - merge this into combineX86ShufflesRecursively.
43412 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43413 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43414 return SDValue(N, 0);
43415
43416 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43417 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43418 // Perform this after other shuffle combines to allow inner shuffles to be
43419 // combined away first.
43420 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43421 return BinOp;
43422 }
43423
43424 return SDValue();
43425}
43426
43427// Simplify variable target shuffle masks based on the demanded elements.
43428// TODO: Handle DemandedBits in mask indices as well?
43429bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43430 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43431 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43432 // If we're demanding all elements don't bother trying to simplify the mask.
43433 unsigned NumElts = DemandedElts.getBitWidth();
43434 if (DemandedElts.isAllOnes())
43435 return false;
43436
43437 SDValue Mask = Op.getOperand(MaskIndex);
43438 if (!Mask.hasOneUse())
43439 return false;
43440
43441 // Attempt to generically simplify the variable shuffle mask.
43442 APInt MaskUndef, MaskZero;
43443 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43444 Depth + 1))
43445 return true;
43446
43447 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43448 // TODO: Support other types from getTargetShuffleMaskIndices?
43449 SDValue BC = peekThroughOneUseBitcasts(Mask);
43450 EVT BCVT = BC.getValueType();
43451 auto *Load = dyn_cast<LoadSDNode>(BC);
43452 if (!Load || !Load->getBasePtr().hasOneUse())
43453 return false;
43454
43455 const Constant *C = getTargetConstantFromNode(Load);
43456 if (!C)
43457 return false;
43458
43459 Type *CTy = C->getType();
43460 if (!CTy->isVectorTy() ||
43461 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43462 return false;
43463
43464 // Handle scaling for i64 elements on 32-bit targets.
43465 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43466 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43467 return false;
43468 unsigned Scale = NumCstElts / NumElts;
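 // For example (assumed values): a v4i64 shuffle mask stored as a v8i32
 // constant on a 32-bit target has NumCstElts = 8 and NumElts = 4, so
 // Scale = 2 and constant elements 2*i and 2*i+1 both belong to mask
 // element i.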
43469
43470 // Simplify mask if we have an undemanded element that is not undef.
43471 bool Simplified = false;
43472 SmallVector<Constant *, 32> ConstVecOps;
43473 for (unsigned i = 0; i != NumCstElts; ++i) {
43474 Constant *Elt = C->getAggregateElement(i);
43475 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43476 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43477 Simplified = true;
43478 continue;
43479 }
43480 ConstVecOps.push_back(Elt);
43481 }
43482 if (!Simplified)
43483 return false;
43484
43485 // Generate new constant pool entry + legalize immediately for the load.
43486 SDLoc DL(Op);
43487 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43488 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43489 SDValue NewMask = TLO.DAG.getLoad(
43490 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43491 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43492 Load->getAlign());
43493 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43494}
43495
43496bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43497 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43498 TargetLoweringOpt &TLO, unsigned Depth) const {
43499 int NumElts = DemandedElts.getBitWidth();
43500 unsigned Opc = Op.getOpcode();
43501 EVT VT = Op.getValueType();
43502
43503 // Handle special case opcodes.
43504 switch (Opc) {
43505 case X86ISD::PMULDQ:
43506 case X86ISD::PMULUDQ: {
43507 APInt LHSUndef, LHSZero;
43508 APInt RHSUndef, RHSZero;
43509 SDValue LHS = Op.getOperand(0);
43510 SDValue RHS = Op.getOperand(1);
43511 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43512 Depth + 1))
43513 return true;
43514 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43515 Depth + 1))
43516 return true;
43517 // Multiply by zero.
43518 KnownZero = LHSZero | RHSZero;
43519 break;
43520 }
43521 case X86ISD::VPMADDUBSW:
43522 case X86ISD::VPMADDWD: {
43523 APInt LHSUndef, LHSZero;
43524 APInt RHSUndef, RHSZero;
43525 SDValue LHS = Op.getOperand(0);
43526 SDValue RHS = Op.getOperand(1);
43527 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43528
43529 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43530 Depth + 1))
43531 return true;
43532 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43533 Depth + 1))
43534 return true;
43535
43536 // TODO: Multiply by zero.
43537
43538 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43539 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43540 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43541 Depth + 1))
43542 return true;
43543 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43544 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43545 Depth + 1))
43546 return true;
43547 break;
43548 }
43549 case X86ISD::PSADBW: {
43550 SDValue LHS = Op.getOperand(0);
43551 SDValue RHS = Op.getOperand(1);
43552 assert(VT.getScalarType() == MVT::i64 &&
43553 LHS.getValueType() == RHS.getValueType() &&
43554 LHS.getValueType().getScalarType() == MVT::i8 &&
43555 "Unexpected PSADBW types");
43556
43557 // Aggressively peek through ops to get at the demanded elts.
43558 if (!DemandedElts.isAllOnes()) {
43559 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43560 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43561 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43562 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43563 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43564 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43565 if (NewLHS || NewRHS) {
43566 NewLHS = NewLHS ? NewLHS : LHS;
43567 NewRHS = NewRHS ? NewRHS : RHS;
43568 return TLO.CombineTo(
43569 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43570 }
43571 }
43572 break;
43573 }
43574 case X86ISD::VSHL:
43575 case X86ISD::VSRL:
43576 case X86ISD::VSRA: {
43577 // We only need the bottom 64-bits of the (128-bit) shift amount.
43578 SDValue Amt = Op.getOperand(1);
43579 MVT AmtVT = Amt.getSimpleValueType();
43580 assert(AmtVT.is128BitVector() && "Unexpected value type");
43581
43582 // If the shift amount is only ever reused as an SSE shift amount then we
43583 // know that only the bottom 64-bits are ever used.
43584 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43585 unsigned UseOpc = Use->getOpcode();
43586 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43587 UseOpc == X86ISD::VSRA) &&
43588 Use->getOperand(0) != Amt;
43589 });
43590
43591 APInt AmtUndef, AmtZero;
43592 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43593 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43594 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43595 Depth + 1, AssumeSingleUse))
43596 return true;
43597 [[fallthrough]];
43598 }
43599 case X86ISD::VSHLI:
43600 case X86ISD::VSRLI:
43601 case X86ISD::VSRAI: {
43602 SDValue Src = Op.getOperand(0);
43603 APInt SrcUndef;
43604 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43605 Depth + 1))
43606 return true;
43607
43608 // Fold shift(0,x) -> 0
43609 if (DemandedElts.isSubsetOf(KnownZero))
43610 return TLO.CombineTo(
43611 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43612
43613 // Aggressively peek through ops to get at the demanded elts.
43614 if (!DemandedElts.isAllOnes())
43615 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43616 Src, DemandedElts, TLO.DAG, Depth + 1))
43617 return TLO.CombineTo(
43618 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43619 break;
43620 }
43621 case X86ISD::VPSHA:
43622 case X86ISD::VPSHL:
43623 case X86ISD::VSHLV:
43624 case X86ISD::VSRLV:
43625 case X86ISD::VSRAV: {
43626 APInt LHSUndef, LHSZero;
43627 APInt RHSUndef, RHSZero;
43628 SDValue LHS = Op.getOperand(0);
43629 SDValue RHS = Op.getOperand(1);
43630 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43631 Depth + 1))
43632 return true;
43633
43634 // Fold shift(0,x) -> 0
43635 if (DemandedElts.isSubsetOf(LHSZero))
43636 return TLO.CombineTo(
43637 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43638
43639 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43640 Depth + 1))
43641 return true;
43642
43643 KnownZero = LHSZero;
43644 break;
43645 }
43646 case X86ISD::CMPM:
43647 case X86ISD::CMPP: {
43648 // Scalarize packed fp comparison if we only require element 0.
43649 if (DemandedElts == 1) {
43650 SDLoc dl(Op);
43651 MVT VT = Op.getSimpleValueType();
43652 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43653 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43654 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43655 SDValue CC = Op.getOperand(2);
43656 if (Opc == X86ISD::CMPM) {
43657 SDValue Cmp =
43658 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43659 return TLO.CombineTo(
43660 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43661 }
43662 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43663 return TLO.CombineTo(Op,
43664 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43665 }
43666 break;
43667 }
43668 case X86ISD::PCMPEQ:
43669 case X86ISD::PCMPGT: {
43670 APInt LHSUndef, LHSZero;
43671 APInt RHSUndef, RHSZero;
43672 SDValue LHS = Op.getOperand(0);
43673 SDValue RHS = Op.getOperand(1);
43674 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43675 Depth + 1))
43676 return true;
43677 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43678 Depth + 1))
43679 return true;
43680 break;
43681 }
43682 case X86ISD::KSHIFTL: {
43683 SDValue Src = Op.getOperand(0);
43684 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43685 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43686 unsigned ShiftAmt = Amt->getZExtValue();
43687
43688 if (ShiftAmt == 0)
43689 return TLO.CombineTo(Op, Src);
43690
43691 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43692 // single shift. We can do this if the bottom bits (which are shifted
43693 // out) are never demanded.
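 // Worked example (assumed values): with C1 = 1 and ShAmt = 3 the combined
 // mask shift is a net KSHIFTL by 2; with C1 = 3 and ShAmt = 1 it becomes a
 // net KSHIFTR by 2 (Diff is negated and the opcode flipped below).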
43694 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43695 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43696 unsigned C1 = Src.getConstantOperandVal(1);
43697 unsigned NewOpc = X86ISD::KSHIFTL;
43698 int Diff = ShiftAmt - C1;
43699 if (Diff < 0) {
43700 Diff = -Diff;
43701 NewOpc = X86ISD::KSHIFTR;
43702 }
43703
43704 SDLoc dl(Op);
43705 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43706 return TLO.CombineTo(
43707 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43708 }
43709 }
43710
43711 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43712 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43713 Depth + 1))
43714 return true;
43715
43716 KnownUndef <<= ShiftAmt;
43717 KnownZero <<= ShiftAmt;
43718 KnownZero.setLowBits(ShiftAmt);
43719 break;
43720 }
43721 case X86ISD::KSHIFTR: {
43722 SDValue Src = Op.getOperand(0);
43723 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43724 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43725 unsigned ShiftAmt = Amt->getZExtValue();
43726
43727 if (ShiftAmt == 0)
43728 return TLO.CombineTo(Op, Src);
43729
43730 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43731 // single shift. We can do this if the top bits (which are shifted
43732 // out) are never demanded.
43733 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43734 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43735 unsigned C1 = Src.getConstantOperandVal(1);
43736 unsigned NewOpc = X86ISD::KSHIFTR;
43737 int Diff = ShiftAmt - C1;
43738 if (Diff < 0) {
43739 Diff = -Diff;
43740 NewOpc = X86ISD::KSHIFTL;
43741 }
43742
43743 SDLoc dl(Op);
43744 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43745 return TLO.CombineTo(
43746 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43747 }
43748 }
43749
43750 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43751 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43752 Depth + 1))
43753 return true;
43754
43755 KnownUndef.lshrInPlace(ShiftAmt);
43756 KnownZero.lshrInPlace(ShiftAmt);
43757 KnownZero.setHighBits(ShiftAmt);
43758 break;
43759 }
43760 case X86ISD::ANDNP: {
43761 // ANDNP = (~LHS & RHS);
43762 SDValue LHS = Op.getOperand(0);
43763 SDValue RHS = Op.getOperand(1);
43764
43765 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43766 APInt UndefElts;
43767 SmallVector<APInt> EltBits;
43768 int NumElts = VT.getVectorNumElements();
43769 int EltSizeInBits = VT.getScalarSizeInBits();
43770 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43771 APInt OpElts = DemandedElts;
43772 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43773 EltBits)) {
43774 OpBits.clearAllBits();
43775 OpElts.clearAllBits();
43776 for (int I = 0; I != NumElts; ++I) {
43777 if (!DemandedElts[I])
43778 continue;
43779 if (UndefElts[I]) {
43780 // We can't assume an undef src element gives an undef dst - the
43781 // other src might be zero.
43782 OpBits.setAllBits();
43783 OpElts.setBit(I);
43784 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43785 (!Invert && !EltBits[I].isZero())) {
43786 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43787 OpElts.setBit(I);
43788 }
43789 }
43790 }
43791 return std::make_pair(OpBits, OpElts);
43792 };
43793 APInt BitsLHS, EltsLHS;
43794 APInt BitsRHS, EltsRHS;
43795 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43796 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43797
43798 APInt LHSUndef, LHSZero;
43799 APInt RHSUndef, RHSZero;
43800 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43801 Depth + 1))
43802 return true;
43803 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43804 Depth + 1))
43805 return true;
43806
43807 if (!DemandedElts.isAllOnes()) {
43808 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43809 TLO.DAG, Depth + 1);
43810 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43811 TLO.DAG, Depth + 1);
43812 if (NewLHS || NewRHS) {
43813 NewLHS = NewLHS ? NewLHS : LHS;
43814 NewRHS = NewRHS ? NewRHS : RHS;
43815 return TLO.CombineTo(
43816 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43817 }
43818 }
43819 break;
43820 }
43821 case X86ISD::CVTSI2P:
43822 case X86ISD::CVTUI2P:
43823 case X86ISD::CVTPH2PS:
43824 case X86ISD::CVTPS2PH: {
43825 SDValue Src = Op.getOperand(0);
43826 EVT SrcVT = Src.getValueType();
43827 APInt SrcUndef, SrcZero;
43828 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43829 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43830 Depth + 1))
43831 return true;
43832 break;
43833 }
43834 case X86ISD::PACKSS:
43835 case X86ISD::PACKUS: {
43836 SDValue N0 = Op.getOperand(0);
43837 SDValue N1 = Op.getOperand(1);
43838
43839 APInt DemandedLHS, DemandedRHS;
43840 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
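 // For example (assumed types): for a v16i8 PACKUS of two v8i16 inputs,
 // result bytes 0-7 come from N0 and bytes 8-15 from N1 (the split is
 // per 128-bit lane for wider types), so getPackDemandedElts divides
 // DemandedElts accordingly.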
43841
43842 APInt LHSUndef, LHSZero;
43843 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43844 Depth + 1))
43845 return true;
43846 APInt RHSUndef, RHSZero;
43847 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43848 Depth + 1))
43849 return true;
43850
43851 // TODO - pass on known zero/undef.
43852
43853 // Aggressively peek through ops to get at the demanded elts.
43854 // TODO - we should do this for all target/faux shuffles ops.
43855 if (!DemandedElts.isAllOnes()) {
43856 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43857 TLO.DAG, Depth + 1);
43858 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43859 TLO.DAG, Depth + 1);
43860 if (NewN0 || NewN1) {
43861 NewN0 = NewN0 ? NewN0 : N0;
43862 NewN1 = NewN1 ? NewN1 : N1;
43863 return TLO.CombineTo(Op,
43864 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43865 }
43866 }
43867 break;
43868 }
43869 case X86ISD::HADD:
43870 case X86ISD::HSUB:
43871 case X86ISD::FHADD:
43872 case X86ISD::FHSUB: {
43873 SDValue N0 = Op.getOperand(0);
43874 SDValue N1 = Op.getOperand(1);
43875
43876 APInt DemandedLHS, DemandedRHS;
43877 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
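 // For example (assumed types): for a v4f32 HADD, result elements 0-1 are
 // pairwise sums from N0 and elements 2-3 are pairwise sums from N1, so
 // only the corresponding source halves are demanded.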
43878
43879 APInt LHSUndef, LHSZero;
43880 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43881 Depth + 1))
43882 return true;
43883 APInt RHSUndef, RHSZero;
43884 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43885 Depth + 1))
43886 return true;
43887
43888 // TODO - pass on known zero/undef.
43889
43890 // Aggressively peek through ops to get at the demanded elts.
43891 // TODO: Handle repeated operands.
43892 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43893 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43894 TLO.DAG, Depth + 1);
43895 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43896 TLO.DAG, Depth + 1);
43897 if (NewN0 || NewN1) {
43898 NewN0 = NewN0 ? NewN0 : N0;
43899 NewN1 = NewN1 ? NewN1 : N1;
43900 return TLO.CombineTo(Op,
43901 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43902 }
43903 }
43904 break;
43905 }
43906 case X86ISD::VTRUNC:
43907 case X86ISD::VTRUNCS:
43908 case X86ISD::VTRUNCUS: {
43909 SDValue Src = Op.getOperand(0);
43910 MVT SrcVT = Src.getSimpleValueType();
43911 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43912 APInt SrcUndef, SrcZero;
43913 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43914 Depth + 1))
43915 return true;
43916 KnownZero = SrcZero.zextOrTrunc(NumElts);
43917 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43918 break;
43919 }
43920 case X86ISD::BLENDI: {
43921 SmallVector<int, 16> BlendMask;
43922 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43923 if (SDValue R = combineBlendOfPermutes(
43924 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43925 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43926 return TLO.CombineTo(Op, R);
43927 break;
43928 }
43929 case X86ISD::BLENDV: {
43930 APInt SelUndef, SelZero;
43931 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43932 SelZero, TLO, Depth + 1))
43933 return true;
43934
43935 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43936 APInt LHSUndef, LHSZero;
43937 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43938 LHSZero, TLO, Depth + 1))
43939 return true;
43940
43941 APInt RHSUndef, RHSZero;
43942 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43943 RHSZero, TLO, Depth + 1))
43944 return true;
43945
43946 KnownZero = LHSZero & RHSZero;
43947 KnownUndef = LHSUndef & RHSUndef;
43948 break;
43949 }
43950 case X86ISD::VZEXT_MOVL: {
43951 // If upper demanded elements are already zero then we have nothing to do.
43952 SDValue Src = Op.getOperand(0);
43953 APInt DemandedUpperElts = DemandedElts;
43954 DemandedUpperElts.clearLowBits(1);
43955 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43956 return TLO.CombineTo(Op, Src);
43957 break;
43958 }
43959 case X86ISD::VZEXT_LOAD: {
43960 // If the upper elements are not demanded then simplify to a
43961 // scalar_to_vector(load()).
43962 MVT SVT = VT.getSimpleVT().getVectorElementType();
43963 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43964 SDLoc DL(Op);
43965 auto *Mem = cast<MemSDNode>(Op);
43966 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43967 Mem->getMemOperand());
43968 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43969 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43970 }
43971 break;
43972 }
43973 case X86ISD::VBROADCAST: {
43974 SDValue Src = Op.getOperand(0);
43975 MVT SrcVT = Src.getSimpleValueType();
43976 // Don't bother broadcasting if we just need the 0'th element.
43977 if (DemandedElts == 1) {
43978 if (!SrcVT.isVector())
43979 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43980 else if (Src.getValueType() != VT)
43981 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43982 SDLoc(Op));
43983 return TLO.CombineTo(Op, Src);
43984 }
43985 if (!SrcVT.isVector())
43986 break;
43987 APInt SrcUndef, SrcZero;
43988 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43989 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43990 Depth + 1))
43991 return true;
43992 // Aggressively peek through src to get at the demanded elt.
43993 // TODO - we should do this for all target/faux shuffles ops.
43994 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43995 Src, SrcElts, TLO.DAG, Depth + 1))
43996 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43997 break;
43998 }
43999 case X86ISD::VPERMV:
44000 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44001 Depth))
44002 return true;
44003 break;
44004 case X86ISD::PSHUFB:
44005 case X86ISD::VPERMV3:
44006 case X86ISD::VPERMILPV:
44007 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44008 Depth))
44009 return true;
44010 break;
44011 case X86ISD::VPPERM:
44012 case X86ISD::VPERMIL2:
44013 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44014 Depth))
44015 return true;
44016 break;
44017 }
44018
44019 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44020 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44021 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44022 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44023 DemandedElts.lshr(NumElts / 2) == 0) {
44024 unsigned SizeInBits = VT.getSizeInBits();
44025 unsigned ExtSizeInBits = SizeInBits / 2;
44026
44027 // See if 512-bit ops only use the bottom 128-bits.
44028 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44029 ExtSizeInBits = SizeInBits / 4;
44030
44031 switch (Opc) {
44032 // Scalar broadcast.
44033 case X86ISD::VBROADCAST: {
44034 SDLoc DL(Op);
44035 SDValue Src = Op.getOperand(0);
44036 if (Src.getValueSizeInBits() > ExtSizeInBits)
44037 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44038 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44039 ExtSizeInBits / VT.getScalarSizeInBits());
44040 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44041 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44042 TLO.DAG, DL, ExtSizeInBits));
44043 }
44044 case X86ISD::VBROADCAST_LOAD: {
44045 SDLoc DL(Op);
44046 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44047 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44048 ExtSizeInBits / VT.getScalarSizeInBits());
44049 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44050 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44051 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44052 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44053 MemIntr->getMemOperand());
44054 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44055 Bcst.getValue(1));
44056 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44057 TLO.DAG, DL, ExtSizeInBits));
44058 }
44059 // Subvector broadcast.
44060 case X86ISD::SUBV_BROADCAST_LOAD: {
44061 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44062 EVT MemVT = MemIntr->getMemoryVT();
44063 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44064 SDLoc DL(Op);
44065 SDValue Ld =
44066 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44067 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44068 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44069 Ld.getValue(1));
44070 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44071 TLO.DAG, DL, ExtSizeInBits));
44072 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44073 SDLoc DL(Op);
44074 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44075 ExtSizeInBits / VT.getScalarSizeInBits());
44076 if (SDValue BcstLd =
44077 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44078 return TLO.CombineTo(Op,
44079 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44080 TLO.DAG, DL, ExtSizeInBits));
44081 }
44082 break;
44083 }
44084 // Byte shifts by immediate.
44085 case X86ISD::VSHLDQ:
44086 case X86ISD::VSRLDQ:
44087 // Shift by uniform.
44088 case X86ISD::VSHL:
44089 case X86ISD::VSRL:
44090 case X86ISD::VSRA:
44091 // Shift by immediate.
44092 case X86ISD::VSHLI:
44093 case X86ISD::VSRLI:
44094 case X86ISD::VSRAI: {
44095 SDLoc DL(Op);
44096 SDValue Ext0 =
44097 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44098 SDValue ExtOp =
44099 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44100 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44101 SDValue Insert =
44102 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44103 return TLO.CombineTo(Op, Insert);
44104 }
44105 case X86ISD::VPERMI: {
44106 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44107 // TODO: This should be done in shuffle combining.
44108 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44110 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44111 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44112 SDLoc DL(Op);
44113 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44114 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44115 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44116 return TLO.CombineTo(Op, Insert);
44117 }
44118 }
44119 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44120 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44121 SDLoc DL(Op);
44122 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44123 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44124 Op.getOperand(1));
44125 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44126 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44127 return TLO.CombineTo(Op, Insert);
44128 }
44129 break;
44130 }
44131 case X86ISD::VPERMV: {
44134 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44135 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44136 VT == MVT::v16f32) &&
44137 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44138 // For lane-crossing shuffles, only split in half in case we're still
44139 // referencing higher elements.
44140 unsigned HalfElts = NumElts / 2;
44141 unsigned HalfSize = SizeInBits / 2;
44142 Mask.resize(HalfElts);
44143 if (all_of(Mask,
44144 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44145 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44146 SDLoc DL(Op);
44147 SDValue Ext;
44148 SDValue M =
44149 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44150 SDValue V =
44151 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44152 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44153 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44154 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44155 else {
44157 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44158 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44159 TLO.DAG.getBitcast(ShufVT, V), M);
44160 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44161 }
44162 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44163 Subtarget, TLO.DAG, DL, SizeInBits);
44164 return TLO.CombineTo(Op, Insert);
44165 }
44166 }
44167 break;
44168 }
44169 case X86ISD::VPERMV3: {
44172 if (Subtarget.hasVLX() &&
44173 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44174 // For lane-crossing shuffles, only split in half in case we're still
44175 // referencing higher elements.
44176 unsigned HalfElts = NumElts / 2;
44177 unsigned HalfSize = SizeInBits / 2;
44178 Mask.resize(HalfElts);
44179 if (all_of(Mask, [&](int M) {
44180 return isUndefOrInRange(M, 0, HalfElts) ||
44181 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44182 })) {
44183 // Adjust mask elements for 2nd operand to point to half width.
44184 for (int &M : Mask)
44185 M = (M < NumElts) ? M : (M - HalfElts);
44186 MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
44187 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44188 SDLoc DL(Op);
44189 SDValue Ext = TLO.DAG.getNode(
44190 Opc, DL, HalfVT,
44191 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44192 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44193 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44194 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44195 Subtarget, TLO.DAG, DL, SizeInBits);
44196 return TLO.CombineTo(Op, Insert);
44197 }
44198 }
44199 break;
44200 }
44201 case X86ISD::VPERM2X128: {
44202 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
44203 SDLoc DL(Op);
44204 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44205 if (LoMask & 0x8)
44206 return TLO.CombineTo(
44207 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44208 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44209 unsigned SrcIdx = (LoMask & 0x2) >> 1;
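 // Decoding example (assumed immediate): for imm 0x31 the low nibble is 0x1,
 // so the low result half is the upper 128 bits of operand 0
 // (EltIdx = NumElts / 2, SrcIdx = 0); a low nibble of 0x8 or above would
 // have produced zero via the early return above.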
44210 SDValue ExtOp =
44211 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44212 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44213 SDValue Insert =
44214 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44215 return TLO.CombineTo(Op, Insert);
44216 }
44217 // Conversions.
44218 // TODO: Add more CVT opcodes when we have test coverage.
44219 case X86ISD::CVTTP2UI: {
44220 if (!Subtarget.hasVLX())
44221 break;
44222 [[fallthrough]];
44223 }
44224 case X86ISD::CVTTP2SI: {
44225 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44226 !Subtarget.hasVLX())
44227 break;
44228 [[fallthrough]];
44229 }
44230 case X86ISD::CVTPH2PS: {
44231 SDLoc DL(Op);
44232 unsigned Scale = SizeInBits / ExtSizeInBits;
44233 SDValue SrcOp = Op.getOperand(0);
44234 MVT SrcVT = SrcOp.getSimpleValueType();
44235 unsigned SrcExtSize =
44236 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44238 ExtSizeInBits / VT.getScalarSizeInBits());
44239 SDValue ExtOp = TLO.DAG.getNode(
44240 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44241 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44242 SDValue Insert =
44243 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44244 return TLO.CombineTo(Op, Insert);
44245 }
44246 // Zero upper elements.
44247 case X86ISD::VZEXT_MOVL:
44248 // Variable blend.
44249 case X86ISD::BLENDV:
44250 // Target unary shuffles:
44251 case X86ISD::MOVDDUP:
44252 // Target unary shuffles by immediate:
44253 case X86ISD::PSHUFD:
44254 case X86ISD::PSHUFLW:
44255 case X86ISD::PSHUFHW:
44256 case X86ISD::VPERMILPI:
44257 // (Non-Lane Crossing) Target Shuffles.
44258 case X86ISD::VPERMILPV:
44259 case X86ISD::VPERMIL2:
44260 case X86ISD::PSHUFB:
44261 case X86ISD::UNPCKL:
44262 case X86ISD::UNPCKH:
44263 case X86ISD::BLENDI:
44264 // Integer ops.
44265 case X86ISD::PACKSS:
44266 case X86ISD::PACKUS:
44267 case X86ISD::PCMPEQ:
44268 case X86ISD::PCMPGT:
44269 case X86ISD::PMULUDQ:
44270 case X86ISD::PMULDQ:
44271 case X86ISD::VSHLV:
44272 case X86ISD::VSRLV:
44273 case X86ISD::VSRAV:
44274 // Float ops.
44275 case X86ISD::FMAX:
44276 case X86ISD::FMIN:
44277 case X86ISD::FMAXC:
44278 case X86ISD::FMINC:
44279 case X86ISD::FRSQRT:
44280 case X86ISD::FRCP:
44281 // Horizontal Ops.
44282 case X86ISD::HADD:
44283 case X86ISD::HSUB:
44284 case X86ISD::FHADD:
44285 case X86ISD::FHSUB: {
44286 SDLoc DL(Op);
44288 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44289 SDValue SrcOp = Op.getOperand(i);
44290 EVT SrcVT = SrcOp.getValueType();
44291 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44292 "Unsupported vector size");
44293 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44294 ExtSizeInBits)
44295 : SrcOp);
44296 }
44297 MVT ExtVT = VT.getSimpleVT();
44298 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44299 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44300 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44301 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44302 SDValue Insert =
44303 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44304 return TLO.CombineTo(Op, Insert);
44305 }
44306 }
44307 }
44308
44309 // For splats, unless we *only* demand the 0'th element,
44310 // stop attempts at simplification here; we aren't going to improve things,
44311 // and the splat is better than any potential shuffle.
44312 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44313 return false;
44314
44315 // Get target/faux shuffle mask.
44316 APInt OpUndef, OpZero;
44317 SmallVector<int, 64> OpMask;
44318 SmallVector<SDValue, 2> OpInputs;
44319 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44320 OpZero, TLO.DAG, Depth, false))
44321 return false;
44322
44323 // Shuffle inputs must be the same size as the result.
44324 if (OpMask.size() != (unsigned)NumElts ||
44325 llvm::any_of(OpInputs, [VT](SDValue V) {
44326 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44327 !V.getValueType().isVector();
44328 }))
44329 return false;
44330
44331 KnownZero = OpZero;
44332 KnownUndef = OpUndef;
44333
44334 // Check if shuffle mask can be simplified to undef/zero/identity.
44335 int NumSrcs = OpInputs.size();
44336 for (int i = 0; i != NumElts; ++i)
44337 if (!DemandedElts[i])
44338 OpMask[i] = SM_SentinelUndef;
44339
44340 if (isUndefInRange(OpMask, 0, NumElts)) {
44341 KnownUndef.setAllBits();
44342 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44343 }
44344 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44345 KnownZero.setAllBits();
44346 return TLO.CombineTo(
44347 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44348 }
44349 for (int Src = 0; Src != NumSrcs; ++Src)
44350 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44351 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44352
44353 // Attempt to simplify inputs.
44354 for (int Src = 0; Src != NumSrcs; ++Src) {
44355 // TODO: Support inputs of different types.
44356 if (OpInputs[Src].getValueType() != VT)
44357 continue;
44358
44359 int Lo = Src * NumElts;
44360 APInt SrcElts = APInt::getZero(NumElts);
44361 for (int i = 0; i != NumElts; ++i)
44362 if (DemandedElts[i]) {
44363 int M = OpMask[i] - Lo;
44364 if (0 <= M && M < NumElts)
44365 SrcElts.setBit(M);
44366 }
44367
44368 // TODO - Propagate input undef/zero elts.
44369 APInt SrcUndef, SrcZero;
44370 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44371 TLO, Depth + 1))
44372 return true;
44373 }
44374
44375 // If we don't demand all elements, then attempt to combine to a simpler
44376 // shuffle.
44377 // We need to convert the depth to something combineX86ShufflesRecursively
44378 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44379 // to match. This prevents combineX86ShuffleChain from returning a
44380 // combined shuffle that's the same as the original root, causing an
44381 // infinite loop.
44382 if (!DemandedElts.isAllOnes()) {
44383 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44384
44385 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44386 for (int i = 0; i != NumElts; ++i)
44387 if (DemandedElts[i])
44388 DemandedMask[i] = i;
44389
44390 SDValue NewShuffle = combineX86ShufflesRecursively(
44391 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44392 X86::MaxShuffleCombineDepth - Depth,
44393 /*AllowVariableCrossLaneMask=*/true,
44394 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44395 TLO.DAG, SDLoc(Op), Subtarget);
44396 if (NewShuffle)
44397 return TLO.CombineTo(Op, NewShuffle);
44398 }
44399
44400 return false;
44401}
44402
44403bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44404 SDValue Op, const APInt &OriginalDemandedBits,
44405 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44406 unsigned Depth) const {
44407 EVT VT = Op.getValueType();
44408 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44409 unsigned Opc = Op.getOpcode();
44410 switch(Opc) {
44411 case X86ISD::VTRUNC: {
44412 KnownBits KnownOp;
44413 SDValue Src = Op.getOperand(0);
44414 MVT SrcVT = Src.getSimpleValueType();
44415
44416 // Simplify the input, using demanded bit information.
44417 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44418 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44419 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44420 return true;
44421 break;
44422 }
44423 case X86ISD::PMULDQ:
44424 case X86ISD::PMULUDQ: {
44425 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44426 KnownBits KnownLHS, KnownRHS;
44427 SDValue LHS = Op.getOperand(0);
44428 SDValue RHS = Op.getOperand(1);
44429
44430 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44431 // FIXME: Can we bound this better?
44432 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44433 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44434 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44435
44436 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44437 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44438 DemandedMaskLHS = DemandedMask;
44439 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44440 DemandedMaskRHS = DemandedMask;
44441
44442 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44443 KnownLHS, TLO, Depth + 1))
44444 return true;
44445 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44446 KnownRHS, TLO, Depth + 1))
44447 return true;
44448
44449 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44450 KnownRHS = KnownRHS.trunc(32);
44451 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44452 KnownRHS.getConstant().isOne()) {
44453 SDLoc DL(Op);
44454 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44455 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44456 }
44457
44458 // Aggressively peek through ops to get at the demanded low bits.
44459 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44460 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44461 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44462 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44463 if (DemandedLHS || DemandedRHS) {
44464 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44465 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44466 return TLO.CombineTo(
44467 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44468 }
44469 break;
44470 }
44471 case X86ISD::ANDNP: {
44472 KnownBits Known2;
44473 SDValue Op0 = Op.getOperand(0);
44474 SDValue Op1 = Op.getOperand(1);
44475
44476 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44477 Known, TLO, Depth + 1))
44478 return true;
44479
44480 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44481 OriginalDemandedElts, Known2, TLO, Depth + 1))
44482 return true;
44483
44484 // If the RHS is a constant, see if we can simplify it.
44485 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44486 OriginalDemandedElts, TLO))
44487 return true;
44488
44489 // ANDNP = (~Op0 & Op1);
44490 Known.One &= Known2.Zero;
44491 Known.Zero |= Known2.One;
44492 break;
44493 }
44494 case X86ISD::VSHLI: {
44495 SDValue Op0 = Op.getOperand(0);
44496 SDValue Op1 = Op.getOperand(1);
44497
44498 unsigned ShAmt = Op1->getAsZExtVal();
44499 if (ShAmt >= BitWidth)
44500 break;
44501
44502 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44503
44504 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44505 // single shift. We can do this if the bottom bits (which are shifted
44506 // out) are never demanded.
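 // Worked example (assumed values): when the demanded bits allow it,
 // (X >>u 2) << 5 becomes X << 3 (Diff = 3) and (X >>u 5) << 2 becomes
 // X >>u 3 (Diff = -3).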
44507 if (Op0.getOpcode() == X86ISD::VSRLI &&
44508 OriginalDemandedBits.countr_zero() >= ShAmt) {
44509 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44510 if (Shift2Amt < BitWidth) {
44511 int Diff = ShAmt - Shift2Amt;
44512 if (Diff == 0)
44513 return TLO.CombineTo(Op, Op0.getOperand(0));
44514
44515 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44516 SDValue NewShift = TLO.DAG.getNode(
44517 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44518 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44519 return TLO.CombineTo(Op, NewShift);
44520 }
44521 }
44522
44523 // If we are only demanding sign bits then we can use the shift source directly.
44524 unsigned NumSignBits =
44525 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44526 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44527 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44528 return TLO.CombineTo(Op, Op0);
44529
44530 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44531 TLO, Depth + 1))
44532 return true;
44533
44534 Known <<= ShAmt;
44535
44536 // Low bits known zero.
44537 Known.Zero.setLowBits(ShAmt);
44538
44539 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44540 // Attempt to avoid multi-use ops if we don't need anything from them.
44541 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44542 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44543 SDValue NewOp =
44544 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44545 return TLO.CombineTo(Op, NewOp);
44546 }
44547 }
44548 return false;
44549 }
44550 case X86ISD::VSRLI: {
44551 SDValue Op0 = Op.getOperand(0);
44552 SDValue Op1 = Op.getOperand(1);
44553
44554 unsigned ShAmt = Op1->getAsZExtVal();
44555 if (ShAmt >= BitWidth)
44556 break;
44557
44558 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44559
44560 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44561 TLO, Depth + 1))
44562 return true;
44563
44564 Known >>= ShAmt;
44565
44566 // High bits known zero.
44567 Known.Zero.setHighBits(ShAmt);
44568
44569 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44570 // Attempt to avoid multi-use ops if we don't need anything from them.
44571 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44572 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44573 SDValue NewOp =
44574 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44575 return TLO.CombineTo(Op, NewOp);
44576 }
44577 }
44578 return false;
44579 }
44580 case X86ISD::VSRAI: {
44581 SDValue Op0 = Op.getOperand(0);
44582 SDValue Op1 = Op.getOperand(1);
44583
44584 unsigned ShAmt = Op1->getAsZExtVal();
44585 if (ShAmt >= BitWidth)
44586 break;
44587
44588 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44589
44590 // If we just want the sign bit then we don't need to shift it.
44591 if (OriginalDemandedBits.isSignMask())
44592 return TLO.CombineTo(Op, Op0);
44593
44594 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44595 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44596 SDValue Op00 = Op0.getOperand(0);
44597 unsigned NumSignBits =
44598 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44599 if (ShAmt < NumSignBits)
44600 return TLO.CombineTo(Op, Op00);
44601 }
44602
44603 // If any of the demanded bits are produced by the sign extension, we also
44604 // demand the input sign bit.
44605 if (OriginalDemandedBits.countl_zero() < ShAmt)
44606 DemandedMask.setSignBit();
44607
44608 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44609 TLO, Depth + 1))
44610 return true;
44611
44612 Known >>= ShAmt;
44613
44614 // If the input sign bit is known to be zero, or if none of the top bits
44615 // are demanded, turn this into an unsigned shift right.
44616 if (Known.Zero[BitWidth - ShAmt - 1] ||
44617 OriginalDemandedBits.countl_zero() >= ShAmt)
44618 return TLO.CombineTo(
44619 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44620
44621 // High bits are known one.
44622 if (Known.One[BitWidth - ShAmt - 1])
44623 Known.One.setHighBits(ShAmt);
44624
44625 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44626 // Attempt to avoid multi-use ops if we don't need anything from them.
44627 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44628 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44629 SDValue NewOp =
44630 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44631 return TLO.CombineTo(Op, NewOp);
44632 }
44633 }
44634 return false;
44635 }
44636 case X86ISD::BLENDI: {
44637 SDValue LHS = Op.getOperand(0);
44638 SDValue RHS = Op.getOperand(1);
44639 APInt Mask = getBLENDIBlendMask(Op);
44640
44641 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44642 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44643 TLO, Depth + 1))
44644 return true;
44645
44646 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44647 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44648 TLO, Depth + 1))
44649 return true;
44650
44651 // Attempt to avoid multi-use ops if we don't need anything from them.
44652 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44653 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44654 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44655 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44656 if (NewLHS || NewRHS) {
44657 NewLHS = NewLHS ? NewLHS : LHS;
44658 NewRHS = NewRHS ? NewRHS : RHS;
44659 return TLO.CombineTo(Op,
44660 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44661 NewLHS, NewRHS, Op.getOperand(2)));
44662 }
44663 break;
44664 }
44665 case X86ISD::BLENDV: {
44666 SDValue Sel = Op.getOperand(0);
44667 SDValue LHS = Op.getOperand(1);
44668 SDValue RHS = Op.getOperand(2);
44669
44670 APInt SignMask = APInt::getSignMask(BitWidth);
44671 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44672 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44673 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44674 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44675 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44676 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44677
44678 if (NewSel || NewLHS || NewRHS) {
44679 NewSel = NewSel ? NewSel : Sel;
44680 NewLHS = NewLHS ? NewLHS : LHS;
44681 NewRHS = NewRHS ? NewRHS : RHS;
44682 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44683 NewSel, NewLHS, NewRHS));
44684 }
44685 break;
44686 }
44687 case X86ISD::PEXTRB:
44688 case X86ISD::PEXTRW: {
44689 SDValue Vec = Op.getOperand(0);
44690 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44691 MVT VecVT = Vec.getSimpleValueType();
44692 unsigned NumVecElts = VecVT.getVectorNumElements();
44693
44694 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44695 unsigned Idx = CIdx->getZExtValue();
44696 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44697
44698 // If we demand no bits from the vector then we must have demanded
44699 // bits from the implicit zext - simplify to zero.
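 // For example (assumed opcode): PEXTRB implicitly zero-extends the i8
 // element into the i32 result, so demanding only bits 8 and above means
 // the result is known to be zero.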
44700 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44701 if (DemandedVecBits == 0)
44702 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44703
44704 APInt KnownUndef, KnownZero;
44705 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44706 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44707 KnownZero, TLO, Depth + 1))
44708 return true;
44709
44710 KnownBits KnownVec;
44711 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44712 KnownVec, TLO, Depth + 1))
44713 return true;
44714
44715 if (SDValue V = SimplifyMultipleUseDemandedBits(
44716 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44717 return TLO.CombineTo(
44718 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44719
44720 Known = KnownVec.zext(BitWidth);
44721 return false;
44722 }
44723 break;
44724 }
44725 case X86ISD::PINSRB:
44726 case X86ISD::PINSRW: {
44727 SDValue Vec = Op.getOperand(0);
44728 SDValue Scl = Op.getOperand(1);
44729 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44730 MVT VecVT = Vec.getSimpleValueType();
44731
44732 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44733 unsigned Idx = CIdx->getZExtValue();
44734 if (!OriginalDemandedElts[Idx])
44735 return TLO.CombineTo(Op, Vec);
44736
44737 KnownBits KnownVec;
44738 APInt DemandedVecElts(OriginalDemandedElts);
44739 DemandedVecElts.clearBit(Idx);
44740 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44741 KnownVec, TLO, Depth + 1))
44742 return true;
44743
44744 KnownBits KnownScl;
44745 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44746 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44747 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44748 return true;
44749
44750 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44751 Known = KnownVec.intersectWith(KnownScl);
44752 return false;
44753 }
44754 break;
44755 }
44756 case X86ISD::PACKSS:
44757 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44758 // sign bit then we can just ask for the source operands' sign bit.
44759 // TODO - add known bits handling.
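 // For example (assumed type): each i16 result of PACKSSDW is a saturated
 // copy of an i32 source element, so its sign bit always equals the sign
 // bit of that source element.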
44760 if (OriginalDemandedBits.isSignMask()) {
44761 APInt DemandedLHS, DemandedRHS;
44762 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44763
44764 KnownBits KnownLHS, KnownRHS;
44765 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44766 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44767 KnownLHS, TLO, Depth + 1))
44768 return true;
44769 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44770 KnownRHS, TLO, Depth + 1))
44771 return true;
44772
44773 // Attempt to avoid multi-use ops if we don't need anything from them.
44774 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44775 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44776 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44777 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44778 if (DemandedOp0 || DemandedOp1) {
44779 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44780 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44781 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44782 }
44783 }
44784 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44785 break;
44786 case X86ISD::VBROADCAST: {
44787 SDValue Src = Op.getOperand(0);
44788 MVT SrcVT = Src.getSimpleValueType();
44789 APInt DemandedElts = APInt::getOneBitSet(
44790 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44791 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44792 TLO, Depth + 1))
44793 return true;
44794 // If we don't need the upper bits, attempt to narrow the broadcast source.
44795 // Don't attempt this on AVX512 as it might affect broadcast folding.
44796 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44797 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44798 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44799 Src->hasOneUse()) {
44800 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44801 SDValue NewSrc =
44802 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44803 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44804 SDValue NewBcst =
44805 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44806 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44807 }
44808 break;
44809 }
44810 case X86ISD::PCMPGT:
44811 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44812 // iff we only need the sign bit then we can use R directly.
44813 if (OriginalDemandedBits.isSignMask() &&
44814 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44815 return TLO.CombineTo(Op, Op.getOperand(1));
44816 break;
44817 case X86ISD::MOVMSK: {
44818 SDValue Src = Op.getOperand(0);
44819 MVT SrcVT = Src.getSimpleValueType();
44820 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44821 unsigned NumElts = SrcVT.getVectorNumElements();
44822
44823 // If we don't need the sign bits at all just return zero.
44824 if (OriginalDemandedBits.countr_zero() >= NumElts)
44825 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44826
44827 // See if we only demand bits from the lower 128-bit vector.
44828 if (SrcVT.is256BitVector() &&
44829 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44830 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44831 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44832 }
44833
44834 // Only demand the vector elements of the sign bits we need.
44835 APInt KnownUndef, KnownZero;
44836 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44837 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44838 TLO, Depth + 1))
44839 return true;
44840
44841 Known.Zero = KnownZero.zext(BitWidth);
44842 Known.Zero.setHighBits(BitWidth - NumElts);
44843
44844 // MOVMSK only uses the MSB from each vector element.
44845 KnownBits KnownSrc;
44846 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44847 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44848 Depth + 1))
44849 return true;
44850
44851 if (KnownSrc.One[SrcBits - 1])
44852 Known.One.setLowBits(NumElts);
44853 else if (KnownSrc.Zero[SrcBits - 1])
44854 Known.Zero.setLowBits(NumElts);
44855
44856     // Attempt to avoid multi-use ops if we don't need anything from it.
44857     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44858             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44859 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44860 return false;
44861 }
44862 case X86ISD::TESTP: {
44863 SDValue Op0 = Op.getOperand(0);
44864 SDValue Op1 = Op.getOperand(1);
44865 MVT OpVT = Op0.getSimpleValueType();
44866 assert((OpVT.getVectorElementType() == MVT::f32 ||
44867 OpVT.getVectorElementType() == MVT::f64) &&
44868 "Illegal vector type for X86ISD::TESTP");
44869
44870 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44871 KnownBits KnownSrc;
44872 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44873 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44874 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44875 AssumeSingleUse) ||
44876 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44877 AssumeSingleUse);
44878 }
44879 case X86ISD::CMOV: {
44880 KnownBits Known2;
44881 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44882 OriginalDemandedElts, Known2, TLO, Depth + 1))
44883 return true;
44884 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44885 OriginalDemandedElts, Known, TLO, Depth + 1))
44886 return true;
44887
44888 // Only known if known in both the LHS and RHS.
44889 Known = Known.intersectWith(Known2);
44890 return false;
44891 }
44892 case X86ISD::BEXTR:
44893 case X86ISD::BEXTRI: {
44894 SDValue Op0 = Op.getOperand(0);
44895 SDValue Op1 = Op.getOperand(1);
44896
44897 // Only bottom 16-bits of the control bits are required.
44898 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44899 // NOTE: SimplifyDemandedBits won't do this for constants.
44900 uint64_t Val1 = Cst1->getZExtValue();
44901 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44902 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44903 SDLoc DL(Op);
44904 return TLO.CombineTo(
44905 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44906 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44907 }
44908
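      // The BEXTR control value (operand 1) encodes the start bit index in
      // bits [7:0] and the extraction length in bits [15:8]; e.g. a control of
      // 0x0804 extracts 8 bits starting at bit 4.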
44909 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44910 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44911
44912 // If the length is 0, the result is 0.
44913 if (Length == 0) {
44914 Known.setAllZero();
44915 return false;
44916 }
44917
44918 if ((Shift + Length) <= BitWidth) {
44919 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44920 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44921 return true;
44922
44923 Known = Known.extractBits(Length, Shift);
44924 Known = Known.zextOrTrunc(BitWidth);
44925 return false;
44926 }
44927 } else {
44928 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44929 KnownBits Known1;
44930 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44931 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44932 return true;
44933
44934 // If the length is 0, replace with 0.
44935 KnownBits LengthBits = Known1.extractBits(8, 8);
44936 if (LengthBits.isZero())
44937 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44938 }
44939
44940 break;
44941 }
44942 case X86ISD::PDEP: {
44943 SDValue Op0 = Op.getOperand(0);
44944 SDValue Op1 = Op.getOperand(1);
44945
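    // PDEP deposits the low bits of operand 0 at the positions of the set bits
    // of the mask (operand 1), e.g. PDEP(src = 0b101, mask = 0b11010) = 0b10010.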
44946 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44947 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44948
44949     // If the demanded bits have leading zeroes, we don't demand those bits
44950     // from the mask.
44951 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44952 return true;
44953
44954 // The number of possible 1s in the mask determines the number of LSBs of
44955 // operand 0 used. Undemanded bits from the mask don't matter so filter
44956 // them before counting.
44957 KnownBits Known2;
44958 uint64_t Count = (~Known.Zero & LoMask).popcount();
44959 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44960 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44961 return true;
44962
44963 // Zeroes are retained from the mask, but not ones.
44964 Known.One.clearAllBits();
44965 // The result will have at least as many trailing zeros as the non-mask
44966 // operand since bits can only map to the same or higher bit position.
44967 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44968 return false;
44969 }
44970 case X86ISD::VPMADD52L:
44971 case X86ISD::VPMADD52H: {
44972 KnownBits KnownOp0, KnownOp1, KnownOp2;
44973 SDValue Op0 = Op.getOperand(0);
44974 SDValue Op1 = Op.getOperand(1);
44975 SDValue Op2 = Op.getOperand(2);
44976 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
44977 // operand 2).
44978 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
44979 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
44980 TLO, Depth + 1))
44981 return true;
44982
44983 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
44984 TLO, Depth + 1))
44985 return true;
44986
44987 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
44988 KnownOp2, TLO, Depth + 1))
44989 return true;
44990
44991 KnownBits KnownMul;
44992 KnownOp0 = KnownOp0.trunc(52);
44993 KnownOp1 = KnownOp1.trunc(52);
44994 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
44995 : KnownBits::mulhu(KnownOp0, KnownOp1);
44996 KnownMul = KnownMul.zext(64);
44997
44998 // lo/hi(X * Y) + Z --> C + Z
44999 if (KnownMul.isConstant()) {
45000 SDLoc DL(Op);
45001 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45002 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45003 }
45004
45005 Known = KnownBits::add(KnownMul, KnownOp2);
45006 return false;
45007 }
45008 }
45009
45010   return TargetLowering::SimplifyDemandedBitsForTargetNode(
45011       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45012}
45013
45014 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45015     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45016 SelectionDAG &DAG, unsigned Depth) const {
45017 int NumElts = DemandedElts.getBitWidth();
45018 unsigned Opc = Op.getOpcode();
45019 EVT VT = Op.getValueType();
45020
45021 switch (Opc) {
45022 case X86ISD::PINSRB:
45023 case X86ISD::PINSRW: {
45024 // If we don't demand the inserted element, return the base vector.
45025 SDValue Vec = Op.getOperand(0);
45026 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45027 MVT VecVT = Vec.getSimpleValueType();
45028 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45029 !DemandedElts[CIdx->getZExtValue()])
45030 return Vec;
45031 break;
45032 }
45033 case X86ISD::VSHLI: {
45034 // If we are only demanding sign bits then we can use the shift source
45035 // directly.
45036 SDValue Op0 = Op.getOperand(0);
45037 unsigned ShAmt = Op.getConstantOperandVal(1);
45038 unsigned BitWidth = DemandedBits.getBitWidth();
45039 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45040 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45041 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45042 return Op0;
45043 break;
45044 }
45045 case X86ISD::VSRAI:
45046 // iff we only need the sign bit then we can use the source directly.
45047 // TODO: generalize where we only demand extended signbits.
45048 if (DemandedBits.isSignMask())
45049 return Op.getOperand(0);
45050 break;
45051 case X86ISD::PCMPGT:
45052 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45053 // iff we only need the sign bit then we can use R directly.
45054 if (DemandedBits.isSignMask() &&
45055 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45056 return Op.getOperand(1);
45057 break;
45058 case X86ISD::BLENDV: {
45059 // BLENDV: Cond (MSB) ? LHS : RHS
45060 SDValue Cond = Op.getOperand(0);
45061 SDValue LHS = Op.getOperand(1);
45062 SDValue RHS = Op.getOperand(2);
45063
45064 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45065 if (CondKnown.isNegative())
45066 return LHS;
45067 if (CondKnown.isNonNegative())
45068 return RHS;
45069 break;
45070 }
45071 case X86ISD::ANDNP: {
45072 // ANDNP = (~LHS & RHS);
45073 SDValue LHS = Op.getOperand(0);
45074 SDValue RHS = Op.getOperand(1);
45075
45076 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45077 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45078
45079 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45080 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45081 // this context, so return RHS.
45082 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45083 return RHS;
45084 break;
45085 }
45086 }
45087
45088 APInt ShuffleUndef, ShuffleZero;
45089 SmallVector<int, 16> ShuffleMask;
45090   SmallVector<SDValue, 16> ShuffleOps;
45091   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45092 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45093 // If all the demanded elts are from one operand and are inline,
45094 // then we can use the operand directly.
45095 int NumOps = ShuffleOps.size();
45096 if (ShuffleMask.size() == (unsigned)NumElts &&
45097         llvm::all_of(ShuffleOps, [VT](SDValue V) {
45098           return VT.getSizeInBits() == V.getValueSizeInBits();
45099 })) {
45100
45101 if (DemandedElts.isSubsetOf(ShuffleUndef))
45102 return DAG.getUNDEF(VT);
45103 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45104 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45105
45106 // Bitmask that indicates which ops have only been accessed 'inline'.
45107 APInt IdentityOp = APInt::getAllOnes(NumOps);
45108 for (int i = 0; i != NumElts; ++i) {
45109 int M = ShuffleMask[i];
45110 if (!DemandedElts[i] || ShuffleUndef[i])
45111 continue;
45112 int OpIdx = M / NumElts;
45113 int EltIdx = M % NumElts;
45114 if (M < 0 || EltIdx != i) {
45115 IdentityOp.clearAllBits();
45116 break;
45117 }
45118 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45119 if (IdentityOp == 0)
45120 break;
45121 }
45122 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45123 "Multiple identity shuffles detected");
45124
45125 if (IdentityOp != 0)
45126 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45127 }
45128 }
45129
45130   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45131       Op, DemandedBits, DemandedElts, DAG, Depth);
45132}
45133
45134 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45135     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45136 bool PoisonOnly, unsigned Depth) const {
45137 unsigned NumElts = DemandedElts.getBitWidth();
45138
45139 switch (Op.getOpcode()) {
45140   case X86ISD::GlobalBaseReg:
45141   case X86ISD::Wrapper:
45142 case X86ISD::WrapperRIP:
45143 return true;
45144 case X86ISD::BLENDI:
45145 case X86ISD::PSHUFD:
45146 case X86ISD::UNPCKL:
45147 case X86ISD::UNPCKH:
45148 case X86ISD::VPERMILPI:
45149 case X86ISD::VPERMV3: {
45150     SmallVector<int, 8> Mask;
45151     SmallVector<SDValue, 2> Ops;
45152     if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45153 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45154 APInt::getZero(NumElts));
45155 for (auto M : enumerate(Mask)) {
45156 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45157 continue;
45158 if (M.value() == SM_SentinelUndef)
45159 return false;
45160 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45161 "Shuffle mask index out of range");
45162 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45163 }
45164 for (auto Op : enumerate(Ops))
45165 if (!DemandedSrcElts[Op.index()].isZero() &&
45166             !DAG.isGuaranteedNotToBeUndefOrPoison(
45167                 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45168 return false;
45169 return true;
45170 }
45171 break;
45172 }
45173 }
45174   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45175       Op, DemandedElts, DAG, PoisonOnly, Depth);
45176}
45177
45178 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45179     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45180 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45181
45182 switch (Op.getOpcode()) {
45183 // SSE bit logic.
45184 case X86ISD::FAND:
45185 case X86ISD::FOR:
45186 case X86ISD::FXOR:
45187 case X86ISD::FANDN:
45188 case X86ISD::ANDNP:
45189 case X86ISD::VPTERNLOG:
45190 return false;
45191 // SSE vector insert/extracts use modulo indices.
45192 case X86ISD::PINSRB:
45193 case X86ISD::PINSRW:
45194 case X86ISD::PEXTRB:
45195 case X86ISD::PEXTRW:
45196 return false;
45197 // SSE vector multiplies are either inbounds or saturate.
45198 case X86ISD::VPMADDUBSW:
45199 case X86ISD::VPMADDWD:
45200 return false;
45201 // SSE vector shifts handle out of bounds shift amounts.
45202 case X86ISD::VSHLI:
45203 case X86ISD::VSRLI:
45204 case X86ISD::VSRAI:
45205 return false;
45206 // SSE blends.
45207 case X86ISD::BLENDI:
45208 case X86ISD::BLENDV:
45209 return false;
45210 // SSE target shuffles.
45211 case X86ISD::PSHUFD:
45212 case X86ISD::UNPCKL:
45213 case X86ISD::UNPCKH:
45214 case X86ISD::VPERMILPI:
45215 case X86ISD::VPERMV3:
45216 return false;
45217 // SSE comparisons handle all icmp/fcmp cases.
45218 // TODO: Add CMPM/MM with test coverage.
45219 case X86ISD::CMPP:
45220 case X86ISD::PCMPEQ:
45221 case X86ISD::PCMPGT:
45222 return false;
45223 // SSE signbit extraction.
45224 case X86ISD::MOVMSK:
45225 return false;
45226 // GFNI instructions.
45227   case X86ISD::GF2P8AFFINEINVQB:
45228   case X86ISD::GF2P8AFFINEQB:
45229   case X86ISD::GF2P8MULB:
45230     return false;
45231   case ISD::INTRINSIC_WO_CHAIN:
45232     switch (Op->getConstantOperandVal(0)) {
45233 case Intrinsic::x86_sse2_pmadd_wd:
45234 case Intrinsic::x86_avx2_pmadd_wd:
45235 case Intrinsic::x86_avx512_pmaddw_d_512:
45236 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45237 case Intrinsic::x86_avx2_pmadd_ub_sw:
45238 case Intrinsic::x86_avx512_pmaddubs_w_512:
45239 return false;
45240 case Intrinsic::x86_avx512_vpermi2var_d_128:
45241 case Intrinsic::x86_avx512_vpermi2var_d_256:
45242 case Intrinsic::x86_avx512_vpermi2var_d_512:
45243 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45244 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45245 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45246 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45247 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45248 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45249 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45250 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45251 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45252 case Intrinsic::x86_avx512_vpermi2var_q_128:
45253 case Intrinsic::x86_avx512_vpermi2var_q_256:
45254 case Intrinsic::x86_avx512_vpermi2var_q_512:
45255 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45256 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45257 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45258 return false;
45259 }
45260 }
45261   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45262       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45263}
45264
45265 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45266                                                   const APInt &DemandedElts,
45267 APInt &UndefElts,
45268 const SelectionDAG &DAG,
45269 unsigned Depth) const {
45270 unsigned NumElts = DemandedElts.getBitWidth();
45271 unsigned Opc = Op.getOpcode();
45272
45273 switch (Opc) {
45274 case X86ISD::VBROADCAST:
45275   case X86ISD::VBROADCAST_LOAD:
45276     UndefElts = APInt::getZero(NumElts);
45277 return true;
45278 }
45279
45280 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45281 DAG, Depth);
45282}
45283
45284// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45285// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
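// e.g. (v8i1 setcc (v8i32 x), (v8i32 y), cc) reports a 256-bit source, so the
// caller knows a 256-bit sign extension of the i1 result needs no truncation.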
45286static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45287 bool AllowTruncate, unsigned Depth) {
45288 // Limit recursion.
45289   if (Depth >= SelectionDAG::MaxRecursionDepth)
45290     return false;
45291 switch (Src.getOpcode()) {
45292 case ISD::TRUNCATE:
45293 if (!AllowTruncate)
45294 return false;
45295 [[fallthrough]];
45296 case ISD::SETCC:
45297 return Src.getOperand(0).getValueSizeInBits() == Size;
45298 case ISD::FREEZE:
45299 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45300 Depth + 1);
45301 case ISD::AND:
45302 case ISD::XOR:
45303 case ISD::OR:
45304 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45305 Depth + 1) &&
45306 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45307 Depth + 1);
45308 case ISD::SELECT:
45309 case ISD::VSELECT:
45310 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45311 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45312 Depth + 1) &&
45313 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45314 Depth + 1);
45315 case ISD::BUILD_VECTOR:
45316 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45317 ISD::isBuildVectorAllOnes(Src.getNode());
45318 }
45319 return false;
45320}
45321
45322// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45323static unsigned getAltBitOpcode(unsigned Opcode) {
45324 switch(Opcode) {
45325 // clang-format off
45326 case ISD::AND: return X86ISD::FAND;
45327 case ISD::OR: return X86ISD::FOR;
45328 case ISD::XOR: return X86ISD::FXOR;
45329 case X86ISD::ANDNP: return X86ISD::FANDN;
45330 // clang-format on
45331 }
45332 llvm_unreachable("Unknown bitwise opcode");
45333}
45334
45335// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
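// SSE1 has no 128-bit integer compares or integer MOVMSK, so when only sign
// bits are needed we look for a v4i32 sign-bit test whose input can be
// reinterpreted as v4f32 and fed to MOVMSKPS instead.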
45336 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45337                                           const SDLoc &DL) {
45338 EVT SrcVT = Src.getValueType();
45339 if (SrcVT != MVT::v4i1)
45340 return SDValue();
45341
45342 switch (Src.getOpcode()) {
45343 case ISD::SETCC:
45344 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45345 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45346 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45347 SDValue Op0 = Src.getOperand(0);
45348 if (ISD::isNormalLoad(Op0.getNode()))
45349 return DAG.getBitcast(MVT::v4f32, Op0);
45350 if (Op0.getOpcode() == ISD::BITCAST &&
45351 Op0.getOperand(0).getValueType() == MVT::v4f32)
45352 return Op0.getOperand(0);
45353 }
45354 break;
45355 case ISD::AND:
45356 case ISD::XOR:
45357 case ISD::OR: {
45358 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45359 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45360 if (Op0 && Op1)
45361 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45362 Op1);
45363 break;
45364 }
45365 }
45366 return SDValue();
45367}
45368
45369// Helper to push sign extension of vXi1 SETCC result through bitops.
45370 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45371                                           SDValue Src, const SDLoc &DL) {
45372 switch (Src.getOpcode()) {
45373 case ISD::SETCC:
45374 case ISD::FREEZE:
45375 case ISD::TRUNCATE:
45376 case ISD::BUILD_VECTOR:
45377 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45378 case ISD::AND:
45379 case ISD::XOR:
45380 case ISD::OR:
45381 return DAG.getNode(
45382 Src.getOpcode(), DL, SExtVT,
45383 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45384 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45385 case ISD::SELECT:
45386 case ISD::VSELECT:
45387 return DAG.getSelect(
45388 DL, SExtVT, Src.getOperand(0),
45389 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45390 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45391 }
45392 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45393}
45394
45395// Try to match patterns such as
45396// (i16 bitcast (v16i1 x))
45397// ->
45398// (i16 movmsk (16i8 sext (v16i1 x)))
45399// before the illegal vector is scalarized on subtargets that don't have legal
45400// vxi1 types.
45401 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45402                                   const SDLoc &DL,
45403 const X86Subtarget &Subtarget) {
45404 EVT SrcVT = Src.getValueType();
45405 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
45406 return SDValue();
45407
45408 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45409 // legalization destroys the v4i32 type.
45410 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45411 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45412 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45413 DAG.getBitcast(MVT::v4f32, V));
45414 return DAG.getZExtOrTrunc(V, DL, VT);
45415 }
45416 }
45417
45418 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45419 // movmskb even with avx512. This will be better than truncating to vXi1 and
45420 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45421 // vpcmpeqb/vpcmpgtb.
45422 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45423 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45424 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45425 Src.getOperand(0).getValueType() == MVT::v64i8);
45426
45427 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45428 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45429 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45430 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45431 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45432 EVT CmpVT = Src.getOperand(0).getValueType();
45433 EVT EltVT = CmpVT.getVectorElementType();
45434 if (CmpVT.getSizeInBits() <= 256 &&
45435 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45436 PreferMovMsk = true;
45437 }
45438
45439 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45440 // MOVMSK is supported in SSE2 or later.
45441 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45442 return SDValue();
45443
45444 // If the upper ops of a concatenation are undef, then try to bitcast the
45445 // lower op and extend.
45446 SmallVector<SDValue, 4> SubSrcOps;
45447 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45448 SubSrcOps.size() >= 2) {
45449 SDValue LowerOp = SubSrcOps[0];
45450 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45451 if (LowerOp.getOpcode() == ISD::SETCC &&
45452 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45453 EVT SubVT = VT.getIntegerVT(
45454 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45455 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45456 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45457 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45458 }
45459 }
45460 }
45461
45462 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45463 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45464 // v8i16 and v16i16.
45465 // For these two cases, we can shuffle the upper element bytes to a
45466 // consecutive sequence at the start of the vector and treat the results as
45467 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45468 // for v16i16 this is not the case, because the shuffle is expensive, so we
45469 // avoid sign-extending to this type entirely.
45470 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45471 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45472 MVT SExtVT;
45473 bool PropagateSExt = false;
45474 switch (SrcVT.getSimpleVT().SimpleTy) {
45475 default:
45476 return SDValue();
45477 case MVT::v2i1:
45478 SExtVT = MVT::v2i64;
45479 break;
45480 case MVT::v4i1:
45481 SExtVT = MVT::v4i32;
45482 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45483 // sign-extend to a 256-bit operation to avoid truncation.
45484 if (Subtarget.hasAVX() &&
45485 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45486 SExtVT = MVT::v4i64;
45487 PropagateSExt = true;
45488 }
45489 break;
45490 case MVT::v8i1:
45491 SExtVT = MVT::v8i16;
45492 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45493 // sign-extend to a 256-bit operation to match the compare.
45494 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45495 // 256-bit because the shuffle is cheaper than sign extending the result of
45496 // the compare.
45497 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45498 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45499 SExtVT = MVT::v8i32;
45500 PropagateSExt = true;
45501 }
45502 break;
45503 case MVT::v16i1:
45504 SExtVT = MVT::v16i8;
45505 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45506 // it is not profitable to sign-extend to 256-bit because this will
45507 // require an extra cross-lane shuffle which is more expensive than
45508 // truncating the result of the compare to 128-bits.
45509 break;
45510 case MVT::v32i1:
45511 SExtVT = MVT::v32i8;
45512 break;
45513 case MVT::v64i1:
45514     // If we have AVX512F, but not AVX512BW, and the input is truncated from
45515     // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45516 if (Subtarget.hasAVX512()) {
45517 if (Subtarget.hasBWI())
45518 return SDValue();
45519 SExtVT = MVT::v64i8;
45520 break;
45521 }
45522 // Split if this is a <64 x i8> comparison result.
45523 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45524 SExtVT = MVT::v64i8;
45525 break;
45526 }
45527 return SDValue();
45528 };
45529
45530 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45531 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45532
45533 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45534 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45535 } else {
45536 if (SExtVT == MVT::v8i16) {
45537 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45538 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45539 }
45540 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45541 }
45542
45543 EVT IntVT =
45544       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
45545   V = DAG.getZExtOrTrunc(V, DL, IntVT);
45546 return DAG.getBitcast(VT, V);
45547}
45548
45549// Convert a vXi1 constant build vector to the same width scalar integer.
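// e.g. a v4i1 build vector <1, 0, undef, 1> becomes the i4 constant 0b1001
// (undef elements are treated as 0).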
45550 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45551   EVT SrcVT = Op.getValueType();
45552   assert(SrcVT.getVectorElementType() == MVT::i1 &&
45553          "Expected a vXi1 vector");
45554   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45555          "Expected a constant build vector");
45556
45557 APInt Imm(SrcVT.getVectorNumElements(), 0);
45558 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45559 SDValue In = Op.getOperand(Idx);
45560 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45561 Imm.setBit(Idx);
45562 }
45563 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45564 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45565}
45566
45567 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45568                                            TargetLowering::DAGCombinerInfo &DCI,
45569                                            const X86Subtarget &Subtarget) {
45570 using namespace SDPatternMatch;
45571 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45572
45573 if (!DCI.isBeforeLegalizeOps())
45574 return SDValue();
45575
45576 // Only do this if we have k-registers.
45577 if (!Subtarget.hasAVX512())
45578 return SDValue();
45579
45580 EVT DstVT = N->getValueType(0);
45581 SDValue Op = N->getOperand(0);
45582 EVT SrcVT = Op.getValueType();
45583
45584 // Make sure we have a bitcast between mask registers and a scalar type.
45585 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45586 DstVT.isScalarInteger()) &&
45587 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45588 SrcVT.isScalarInteger()))
45589 return SDValue();
45590
45591 SDValue LHS, RHS;
45592
45593 // Look for logic ops.
45595 return SDValue();
45596
45597 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45598 // least one of the getBitcast() will fold away).
45599 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45600       sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45601     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45602 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45603
45604 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45605 // Most of these have to move a constant from the scalar domain anyway.
45606   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45607     RHS = combinevXi1ConstantToInteger(RHS, DAG);
45608     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45609 DAG.getBitcast(DstVT, LHS), RHS);
45610 }
45611
45612 return SDValue();
45613}
45614
45615 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45616                                     const X86Subtarget &Subtarget) {
45617 SDLoc DL(BV);
45618 unsigned NumElts = BV->getNumOperands();
45619 SDValue Splat = BV->getSplatValue();
45620
45621 // Build MMX element from integer GPR or SSE float values.
45622 auto CreateMMXElement = [&](SDValue V) {
45623 if (V.isUndef())
45624 return DAG.getUNDEF(MVT::x86mmx);
45625 if (V.getValueType().isFloatingPoint()) {
45626 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45627 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45628 V = DAG.getBitcast(MVT::v2i64, V);
45629 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45630 }
45631 V = DAG.getBitcast(MVT::i32, V);
45632 } else {
45633 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45634 }
45635 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45636 };
45637
45638 // Convert build vector ops to MMX data in the bottom elements.
45639   SmallVector<SDValue, 8> Ops;
45640
45641 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45642
45643 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45644 if (Splat) {
45645 if (Splat.isUndef())
45646 return DAG.getUNDEF(MVT::x86mmx);
45647
45648 Splat = CreateMMXElement(Splat);
45649
45650 if (Subtarget.hasSSE1()) {
45651 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45652 if (NumElts == 8)
45653 Splat = DAG.getNode(
45654 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45655 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45656 TLI.getPointerTy(DAG.getDataLayout())),
45657 Splat, Splat);
45658
45659 // Use PSHUFW to repeat 16-bit elements.
45660 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45661 return DAG.getNode(
45662 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45663 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45664 TLI.getPointerTy(DAG.getDataLayout())),
45665 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45666 }
45667 Ops.append(NumElts, Splat);
45668 } else {
45669 for (unsigned i = 0; i != NumElts; ++i)
45670 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45671 }
45672
45673 // Use tree of PUNPCKLs to build up general MMX vector.
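  // e.g. for v8i8 this interleaves byte pairs with PUNPCKLBW, then word pairs
  // with PUNPCKLWD, and finally the remaining dword pair with PUNPCKLDQ,
  // building the vector up from the low elements.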
45674 while (Ops.size() > 1) {
45675 unsigned NumOps = Ops.size();
45676 unsigned IntrinOp =
45677 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45678 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45679 : Intrinsic::x86_mmx_punpcklbw));
45680 SDValue Intrin = DAG.getTargetConstant(
45681 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45682 for (unsigned i = 0; i != NumOps; i += 2)
45683 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45684 Ops[i], Ops[i + 1]);
45685 Ops.resize(NumOps / 2);
45686 }
45687
45688 return Ops[0];
45689}
45690
45691// Recursive function that attempts to find if a bool vector node was originally
45692// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45693// integer. If so, replace the scalar ops with bool vector equivalents back down
45694// the chain.
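// e.g. (v16i1 bitcast (i16 trunc (i32 bitcast (v32i1 X)))) can be rebuilt as
// (v16i1 extract_subvector (v32i1 X), 0) when the i1 vector types are legal.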
45695 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45696                                           SelectionDAG &DAG,
45697 const X86Subtarget &Subtarget,
45698 unsigned Depth = 0) {
45699   if (Depth >= SelectionDAG::MaxRecursionDepth)
45700     return SDValue(); // Limit search depth.
45701
45702 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45703 unsigned Opc = V.getOpcode();
45704 switch (Opc) {
45705 case ISD::BITCAST: {
45706 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45707 SDValue Src = V.getOperand(0);
45708 EVT SrcVT = Src.getValueType();
45709 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45710 return DAG.getBitcast(VT, Src);
45711 break;
45712 }
45713 case ISD::Constant: {
45714 auto *C = cast<ConstantSDNode>(V);
45715 if (C->isZero())
45716 return DAG.getConstant(0, DL, VT);
45717 if (C->isAllOnes())
45718 return DAG.getAllOnesConstant(DL, VT);
45719 break;
45720 }
45721 case ISD::TRUNCATE: {
45722 // If we find a suitable source, a truncated scalar becomes a subvector.
45723 SDValue Src = V.getOperand(0);
45724 EVT NewSrcVT =
45725 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45726 if (TLI.isTypeLegal(NewSrcVT))
45727 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45728 Subtarget, Depth + 1))
45729 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45730 DAG.getVectorIdxConstant(0, DL));
45731 break;
45732 }
45733 case ISD::ANY_EXTEND:
45734 case ISD::ZERO_EXTEND: {
45735 // If we find a suitable source, an extended scalar becomes a subvector.
45736 SDValue Src = V.getOperand(0);
45737 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45738 Src.getScalarValueSizeInBits());
45739 if (TLI.isTypeLegal(NewSrcVT))
45740 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45741 Subtarget, Depth + 1))
45742 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45743 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45744 : DAG.getConstant(0, DL, VT),
45745 N0, DAG.getVectorIdxConstant(0, DL));
45746 break;
45747 }
45748 case ISD::OR:
45749 case ISD::XOR: {
45750 // If we find suitable sources, we can just move the op to the vector
45751 // domain.
45752 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45753 Subtarget, Depth + 1))
45754 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45755 Subtarget, Depth + 1))
45756 return DAG.getNode(Opc, DL, VT, N0, N1);
45757 break;
45758 }
45759 case ISD::SHL: {
45760 // If we find a suitable source, a SHL becomes a KSHIFTL.
45761 SDValue Src0 = V.getOperand(0);
45762 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45763 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45764 break;
45765
45766 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45767 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45768 Depth + 1))
45769 return DAG.getNode(
45770 X86ISD::KSHIFTL, DL, VT, N0,
45771 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45772 break;
45773 }
45774 }
45775
45776 // Does the inner bitcast already exist?
45777 if (Depth > 0)
45778 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45779 return SDValue(Alt, 0);
45780
45781 return SDValue();
45782}
45783
45784 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45785                               TargetLowering::DAGCombinerInfo &DCI,
45786                               const X86Subtarget &Subtarget) {
45787 SDValue N0 = N->getOperand(0);
45788 EVT VT = N->getValueType(0);
45789 EVT SrcVT = N0.getValueType();
45790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45791
45792 // Try to match patterns such as
45793 // (i16 bitcast (v16i1 x))
45794 // ->
45795 // (i16 movmsk (16i8 sext (v16i1 x)))
45796 // before the setcc result is scalarized on subtargets that don't have legal
45797 // vxi1 types.
45798 if (DCI.isBeforeLegalize()) {
45799 SDLoc dl(N);
45800 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45801 return V;
45802
45803 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45804 // type, widen both sides to avoid a trip through memory.
45805 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45806 Subtarget.hasAVX512()) {
45807 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45808 N0 = DAG.getBitcast(MVT::v8i1, N0);
45809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45810 DAG.getVectorIdxConstant(0, dl));
45811 }
45812
45813 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45814 // type, widen both sides to avoid a trip through memory.
45815 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45816 Subtarget.hasAVX512()) {
45817 // Use zeros for the widening if we already have some zeroes. This can
45818 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45819 // stream of this.
45820 // FIXME: It might make sense to detect a concat_vectors with a mix of
45821 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45822 // a separate combine. What we can't do is canonicalize the operands of
45823 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45824 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45825 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45826 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45827 SrcVT = LastOp.getValueType();
45828 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45829           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
45830           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45831 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45832 N0 = DAG.getBitcast(MVT::i8, N0);
45833 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45834 }
45835 }
45836
45837 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45838 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45839 Ops[0] = N0;
45840 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45841 N0 = DAG.getBitcast(MVT::i8, N0);
45842 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45843 }
45844 } else if (DCI.isAfterLegalizeDAG()) {
45845 // If we're bitcasting from iX to vXi1, see if the integer originally
45846 // began as a vXi1 and whether we can remove the bitcast entirely.
45847 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45848 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45849 if (SDValue V =
45850 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45851 return V;
45852 }
45853 }
45854
45855 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45856 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45857 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45858 // we can help with known bits propagation from the vXi1 domain to the
45859 // scalar domain.
45860 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45861 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45862 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45863       isNullConstant(N0.getOperand(1)))
45864     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45865 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45866
45867 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45868 // and the vbroadcast_load are both integer or both fp. In some cases this
45869 // will remove the bitcast entirely.
45870 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45871 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45872 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45873 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45874 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45875 // Don't swap i8/i16 since don't have fp types that size.
45876 if (MemSize >= 32) {
45877 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45878 : MVT::getIntegerVT(MemSize);
45879 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45880 : MVT::getIntegerVT(SrcVTSize);
45881 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45882
45883 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45884 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45885 SDValue ResNode =
45886           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45887                                   MemVT, BCast->getMemOperand());
45888 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45889 return DAG.getBitcast(VT, ResNode);
45890 }
45891 }
45892
45893 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45894 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45895 SDValue Src = peekThroughTruncates(N0);
45896 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45897 Src.getOperand(0).getValueSizeInBits() == 128 &&
45898 isNullConstant(Src.getOperand(1))) {
45899 SDLoc DL(N);
45900 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45901 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45902 DAG.getVectorIdxConstant(0, DL));
45903 }
45904 }
45905
45906 // Since MMX types are special and don't usually play with other vector types,
45907 // it's better to handle them early to be sure we emit efficient code by
45908 // avoiding store-load conversions.
45909 if (VT == MVT::x86mmx) {
45910 // Detect MMX constant vectors.
45911 APInt UndefElts;
45912 SmallVector<APInt, 1> EltBits;
45913 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45914 /*AllowWholeUndefs*/ true,
45915 /*AllowPartialUndefs*/ true)) {
45916 SDLoc DL(N0);
45917 // Handle zero-extension of i32 with MOVD.
45918 if (EltBits[0].countl_zero() >= 32)
45919 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45920 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45921 // Else, bitcast to a double.
45922 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45923 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45924 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45925 }
45926
45927 // Detect bitcasts to x86mmx low word.
45928 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45929 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45930 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45931 bool LowUndef = true, AllUndefOrZero = true;
45932 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45933 SDValue Op = N0.getOperand(i);
45934 LowUndef &= Op.isUndef() || (i >= e/2);
45935 AllUndefOrZero &= isNullConstantOrUndef(Op);
45936 }
45937 if (AllUndefOrZero) {
45938 SDValue N00 = N0.getOperand(0);
45939 SDLoc dl(N00);
45940 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45941 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45942 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45943 }
45944 }
45945
45946 // Detect bitcasts of 64-bit build vectors and convert to a
45947 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45948 // lowest element.
45949 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45950 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45951 SrcVT == MVT::v8i8))
45952 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45953
45954 // Detect bitcasts between element or subvector extraction to x86mmx.
45955 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45956          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45957         isNullConstant(N0.getOperand(1))) {
45958 SDValue N00 = N0.getOperand(0);
45959 if (N00.getValueType().is128BitVector())
45960 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45961 DAG.getBitcast(MVT::v2i64, N00));
45962 }
45963
45964 // Detect bitcasts from FP_TO_SINT to x86mmx.
45965 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45966 SDLoc DL(N0);
45967 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45968 DAG.getUNDEF(MVT::v2i32));
45969 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45970 DAG.getBitcast(MVT::v2i64, Res));
45971 }
45972 }
45973
45974 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45975 // most of these to scalar anyway.
45976 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45977 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45978       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45979     return combinevXi1ConstantToInteger(N0, DAG);
45980 }
45981
45982 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
45983 VT.getVectorElementType() == MVT::i1) {
45984 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
45985 if (C->isAllOnes())
45986 return DAG.getConstant(1, SDLoc(N0), VT);
45987 if (C->isZero())
45988 return DAG.getConstant(0, SDLoc(N0), VT);
45989 }
45990 }
45991
45992 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45993 // Turn it into a sign bit compare that produces a k-register. This avoids
45994 // a trip through a GPR.
45995 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45996 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45997       isPowerOf2_32(VT.getVectorNumElements())) {
45998     unsigned NumElts = VT.getVectorNumElements();
45999 SDValue Src = N0;
46000
46001 // Peek through truncate.
46002 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46003 Src = N0.getOperand(0);
46004
46005 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46006 SDValue MovmskIn = Src.getOperand(0);
46007 MVT MovmskVT = MovmskIn.getSimpleValueType();
46008 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46009
46010 // We allow extra bits of the movmsk to be used since they are known zero.
46011 // We can't convert a VPMOVMSKB without avx512bw.
46012 if (MovMskElts <= NumElts &&
46013 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46014 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46015 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46016 SDLoc dl(N);
46017 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46018 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46019 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46020 if (EVT(CmpVT) == VT)
46021 return Cmp;
46022
46023 // Pad with zeroes up to original VT to replace the zeroes that were
46024 // being used from the MOVMSK.
46025 unsigned NumConcats = NumElts / MovMskElts;
46026 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46027 Ops[0] = Cmp;
46028 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46029 }
46030 }
46031 }
46032
46033 // Try to remove bitcasts from input and output of mask arithmetic to
46034 // remove GPR<->K-register crossings.
46035 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46036 return V;
46037
46038 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46039 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46040 SrcVT.getVectorNumElements() == 1)
46041 return N0.getOperand(1);
46042
46043 // Convert a bitcasted integer logic operation that has one bitcasted
46044 // floating-point operand into a floating-point logic operation. This may
46045 // create a load of a constant, but that is cheaper than materializing the
46046 // constant in an integer register and transferring it to an SSE register or
46047 // transferring the SSE operand to integer register and back.
46048 unsigned FPOpcode;
46049 switch (N0.getOpcode()) {
46050 // clang-format off
46051 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46052 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46053 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46054 default: return SDValue();
46055 // clang-format on
46056 }
46057
46058 // Check if we have a bitcast from another integer type as well.
46059 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46060 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46061 (Subtarget.hasFP16() && VT == MVT::f16) ||
46062 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46063 TLI.isTypeLegal(VT))))
46064 return SDValue();
46065
46066 SDValue LogicOp0 = N0.getOperand(0);
46067 SDValue LogicOp1 = N0.getOperand(1);
46068 SDLoc DL0(N0);
46069
46070 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46071 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46072 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46073 LogicOp0.getOperand(0).getValueType() == VT &&
46074 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46075 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46076 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46077 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46078 }
46079 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46080 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46081 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46082 LogicOp1.getOperand(0).getValueType() == VT &&
46083 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46084 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46085 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46086 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46087 }
46088
46089 return SDValue();
46090}
46091
46092 // (mul (zext a), (sext b))
46093static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46094 SDValue &Op1) {
46095 Op0 = Mul.getOperand(0);
46096 Op1 = Mul.getOperand(1);
46097
46098   // Operand 1 should be the sign-extended value.
46099 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46100 std::swap(Op0, Op1);
46101
46102 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46103 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46104 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46105 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46106 return true;
46107
46108 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46109 return (BV && BV->isConstant());
46110 };
46111
46112   // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46113   // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46114   // signed value, so we just check its significant (sign) bits.
46115 if ((IsFreeTruncation(Op0) &&
46116 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46117 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46118 return true;
46119
46120 return false;
46121}
46122
46123 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46124                               unsigned &LogBias, const SDLoc &DL,
46125 const X86Subtarget &Subtarget) {
46126 // Extend or truncate to MVT::i8 first.
46127 MVT Vi8VT =
46128 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46129 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46130 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46131
46132 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46133 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46134 // The src A, B element type is i8, but the dst C element type is i32.
46135   // When we calculate the reduction stages we use the vXi8 source vector type,
46136   // so we need LogBias = 2 to account for the two stages already folded here.
46137 LogBias = 2;
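  // i.e. each i32 result of VPDPBUSD already sums four adjacent byte products,
  // which folds log2(4) = 2 reduction stages into the instruction itself.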
46138
46139 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46140 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46141 RegSize = std::max(512u, RegSize);
46142
46143 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46144 // fill in the missing vector elements with 0.
46145 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46146 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46147 Ops[0] = LHS;
46148 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46149 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46150 Ops[0] = RHS;
46151 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46152
46153 // Actually build the DotProduct, split as 256/512 bits for
46154 // AVXVNNI/AVX512VNNI.
46155 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46156                       ArrayRef<SDValue> Ops) {
46157     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46158 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46159 };
46160 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46161 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46162
46163 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46164 DpBuilder, false);
46165}
46166
46167// Create a PSADBW given two sources representable as zexts of vXi8.
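// PSADBW computes, per 64-bit lane, the sum of absolute differences of eight
// unsigned byte pairs; with one all-zero operand it reduces to a horizontal
// sum of the other operand's bytes in that lane.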
46168 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &N0, const SDValue &N1,
46169                             const SDLoc &DL, const X86Subtarget &Subtarget) {
46170 // Find the appropriate width for the PSADBW.
46171 EVT DstVT = N0.getValueType();
46172 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46173 DstVT.getVectorElementCount());
46174 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46175
46176 // Widen the vXi8 vectors, padding with zero vector elements.
46177 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46178 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46179 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46180 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46181 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46182 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46183 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46184
46185 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46186 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46187                            ArrayRef<SDValue> Ops) {
46188     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46189 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46190 };
46191 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46192 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46193 PSADBWBuilder);
46194}
46195
46196 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46197// PHMINPOSUW.
46198 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46199                                       const X86Subtarget &Subtarget) {
46200 // Bail without SSE41.
46201 if (!Subtarget.hasSSE41())
46202 return SDValue();
46203
46204 EVT ExtractVT = Extract->getValueType(0);
46205 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46206 return SDValue();
46207
46208 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46209 ISD::NodeType BinOp;
46210 SDValue Src = DAG.matchBinOpReduction(
46211 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46212 if (!Src)
46213 return SDValue();
46214
46215 EVT SrcVT = Src.getValueType();
46216 EVT SrcSVT = SrcVT.getScalarType();
46217 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46218 return SDValue();
46219
46220 SDLoc DL(Extract);
46221 SDValue MinPos = Src;
46222
46223 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46224 while (SrcVT.getSizeInBits() > 128) {
46225 SDValue Lo, Hi;
46226 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46227 SrcVT = Lo.getValueType();
46228 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46229 }
46230 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46231 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46232 "Unexpected value type");
46233
46234 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46235 // to flip the value accordingly.
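  // e.g. for SMAX we XOR with the signed-max constant first, which maps the
  // signed maximum onto the unsigned minimum that PHMINPOSUW selects; XORing
  // with the same mask afterwards recovers the original value.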
46236 SDValue Mask;
46237 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46238 if (BinOp == ISD::SMAX)
46239 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46240 else if (BinOp == ISD::SMIN)
46241 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46242 else if (BinOp == ISD::UMAX)
46243 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46244
46245 if (Mask)
46246 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46247
46248 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46249 // shuffling each upper element down and insert zeros. This means that the
46250 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46251 // ready for the PHMINPOS.
46252 if (ExtractVT == MVT::i8) {
46253     SDValue Upper = DAG.getVectorShuffle(
46254         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46255 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46256 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46257 }
46258
46259 // Perform the PHMINPOS on a v8i16 vector,
46260 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46261 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46262 MinPos = DAG.getBitcast(SrcVT, MinPos);
46263
46264 if (Mask)
46265 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46266
46267 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46268 DAG.getVectorIdxConstant(0, DL));
46269}
46270
46271// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
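// e.g. an any_of reduction over vector sign bits becomes (MOVMSK(x) != 0),
// replacing the whole reduction with a single scalar compare.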
46272 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46273                                          const X86Subtarget &Subtarget) {
46274 // Bail without SSE2.
46275 if (!Subtarget.hasSSE2())
46276 return SDValue();
46277
46278 EVT ExtractVT = Extract->getValueType(0);
46279 unsigned BitWidth = ExtractVT.getSizeInBits();
46280 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46281 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46282 return SDValue();
46283
46284 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46285 ISD::NodeType BinOp;
46286 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46287 if (!Match && ExtractVT == MVT::i1)
46288 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46289 if (!Match)
46290 return SDValue();
46291
46292 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46293 // which we can't support here for now.
46294 if (Match.getScalarValueSizeInBits() != BitWidth)
46295 return SDValue();
46296
46297 SDValue Movmsk;
46298 SDLoc DL(Extract);
46299 EVT MatchVT = Match.getValueType();
46300 unsigned NumElts = MatchVT.getVectorNumElements();
46301 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46302 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46303 LLVMContext &Ctx = *DAG.getContext();
46304
46305 if (ExtractVT == MVT::i1) {
46306 // Special case for (pre-legalization) vXi1 reductions.
46307 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46308 return SDValue();
46309 if (Match.getOpcode() == ISD::SETCC) {
46310 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46311 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46312 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46313 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46314 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46315 X86::CondCode X86CC;
46316 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46317 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46318 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46319 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46320 DAG, X86CC))
46321 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46322 getSETCC(X86CC, V, DL, DAG));
46323 }
46324 }
46325 if (TLI.isTypeLegal(MatchVT)) {
46326 // If this is a legal AVX512 predicate type then we can just bitcast.
46327 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46328 Movmsk = DAG.getBitcast(MovmskVT, Match);
46329 } else {
46330 // Use combineBitcastvxi1 to create the MOVMSK.
46331 while (NumElts > MaxElts) {
46332 SDValue Lo, Hi;
46333 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46334 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46335 NumElts /= 2;
46336 }
46337 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46338 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46339 }
46340 if (!Movmsk)
46341 return SDValue();
46342 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46343 } else {
46344 // FIXME: Better handling of k-registers or 512-bit vectors?
46345 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46346 if (!(MatchSizeInBits == 128 ||
46347 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46348 return SDValue();
46349
46350 // Make sure this isn't a vector of 1 element. The perf win from using
46351 // MOVMSK diminishes with fewer elements in the reduction, but it is
46352 // generally better to get the comparison over to the GPRs as soon as
46353 // possible to reduce the number of vector ops.
46354 if (Match.getValueType().getVectorNumElements() < 2)
46355 return SDValue();
46356
46357 // Check that we are extracting a reduction of all sign bits.
46358 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46359 return SDValue();
46360
46361 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46362 SDValue Lo, Hi;
46363 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46364 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46365 MatchSizeInBits = Match.getValueSizeInBits();
46366 }
46367
46368 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46369 MVT MaskSrcVT;
46370 if (64 == BitWidth || 32 == BitWidth)
46371 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46372 MatchSizeInBits / BitWidth);
46373 else
46374 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46375
46376 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46377 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46378 NumElts = MaskSrcVT.getVectorNumElements();
46379 }
46380 assert((NumElts <= 32 || NumElts == 64) &&
46381 "Not expecting more than 64 elements");
46382
46383 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46384 if (BinOp == ISD::XOR) {
46385 // parity -> (PARITY(MOVMSK X))
46386 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46387 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46388 }
46389
46390 SDValue CmpC;
46391 ISD::CondCode CondCode;
46392 if (BinOp == ISD::OR) {
46393 // any_of -> MOVMSK != 0
46394 CmpC = DAG.getConstant(0, DL, CmpVT);
46395 CondCode = ISD::CondCode::SETNE;
46396 } else {
46397 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46398 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46399 DL, CmpVT);
46400 CondCode = ISD::CondCode::SETEQ;
46401 }
46402
46403 // The setcc produces an i8 of 0/1, so extend that to the result width and
46404 // negate to get the final 0/-1 mask value.
46405 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46406 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46407 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46408 return DAG.getNegative(Zext, DL, ExtractVT);
46409}
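// Illustrative note (added commentary, not part of the upstream source): for a
// v4i32 compare mask M, MOVMSKPS packs the four sign bits into a GPR, so the
// reductions above become scalar compares:
//   any_of(M) -> (movmsk M) != 0
//   all_of(M) -> (movmsk M) == 0b1111
//   parity(M) -> parity(movmsk M)
// which avoids a chain of vector OR/AND/XOR plus shuffles.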
46410
46411 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46412 const X86Subtarget &Subtarget) {
46413 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46414 return SDValue();
46415
46416 EVT ExtractVT = Extract->getValueType(0);
46417 // Verify the type we're extracting is i32, as the output element type of
46418 // vpdpbusd is i32.
46419 if (ExtractVT != MVT::i32)
46420 return SDValue();
46421
46422 EVT VT = Extract->getOperand(0).getValueType();
46423 if (!isPowerOf2_32(VT.getVectorNumElements()))
46424 return SDValue();
46425
46426 // Match shuffle + add pyramid.
46427 ISD::NodeType BinOp;
46428 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46429
46430 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46431 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46432 // before adding into the accumulator.
46433 // TODO:
46434 // We also need to verify that the multiply has at least 2x the number of bits
46435 // of the input. We shouldn't match
46436 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46437 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46438 // Root = Root.getOperand(0);
46439
46440 // If there was a match, we want Root to be a mul.
46441 if (!Root || Root.getOpcode() != ISD::MUL)
46442 return SDValue();
46443
46444 // Check whether we have an extend and mul pattern
46445 SDValue LHS, RHS;
46446 if (!detectExtMul(DAG, Root, LHS, RHS))
46447 return SDValue();
46448
46449 // Create the dot product instruction.
46450 SDLoc DL(Extract);
46451 unsigned StageBias;
46452 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46453
46454 // If the original vector was wider than 4 elements, sum over the results
46455 // in the DP vector.
46456 unsigned Stages = Log2_32(VT.getVectorNumElements());
46457 EVT DpVT = DP.getValueType();
46458
46459 if (Stages > StageBias) {
46460 unsigned DpElems = DpVT.getVectorNumElements();
46461
46462 for (unsigned i = Stages - StageBias; i > 0; --i) {
46463 SmallVector<int, 16> Mask(DpElems, -1);
46464 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46465 Mask[j] = MaskEnd + j;
46466
46467 SDValue Shuffle =
46468 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46469 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46470 }
46471 }
46472
46473 // Return the lowest ExtractSizeInBits bits.
46474 EVT ResVT =
46475 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46476 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46477 DP = DAG.getBitcast(ResVT, DP);
46478 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46479 Extract->getOperand(1));
46480}
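// Illustrative note (added commentary, not part of the upstream source):
// VPDPBUSD computes, per i32 lane,
//   acc[i] += sum_{j=0..3} zext(a.u8[4*i+j]) * sext(b.i8[4*i+j])
// so a reduce.add over an i32-extended i8 multiply collapses into one dot
// product instruction plus the shuffle/add pyramid emitted above.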
46481
46482 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46483 const X86Subtarget &Subtarget) {
46484 using namespace SDPatternMatch;
46485
46486 // PSADBW is only supported on SSE2 and up.
46487 if (!Subtarget.hasSSE2())
46488 return SDValue();
46489
46490 EVT ExtractVT = Extract->getValueType(0);
46491 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46492 ExtractVT != MVT::i64)
46493 return SDValue();
46494
46495 EVT VT = Extract->getOperand(0).getValueType();
46496 if (!isPowerOf2_32(VT.getVectorNumElements()))
46497 return SDValue();
46498
46499 // Match shuffle + add pyramid.
46500 ISD::NodeType BinOp;
46501 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46502 if (!Root)
46503 return SDValue();
46504
46505 // The operand is expected to be zero extended from i8.
46506 // In order to convert to i64 and above, additional any/zero/sign
46507 // extend is expected.
46508 // The zero extend from 32 bit has no mathematical effect on the result.
46509 // Also the sign extend is basically a zero extend
46510 // (extends the sign bit which is zero).
46511 // So it is correct to skip the sign/zero extend instruction.
46512 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46513 Root.getOpcode() == ISD::ZERO_EXTEND ||
46514 Root.getOpcode() == ISD::ANY_EXTEND)
46515 Root = Root.getOperand(0);
46516
46517 // Check whether we have a vXi8 abdu pattern.
46518 // TODO: Just match ISD::ABDU once the DAG is topological sorted.
46519 SDValue Src0, Src1;
46520 if (!sd_match(
46521 Root,
46522 m_AnyOf(
46523 m_SpecificVectorElementVT(
46524 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46525 m_SpecificVectorElementVT(
46526 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46527 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46528 m_Abs(
46529 m_Sub(m_AllOf(m_Value(Src0),
46530 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46531 m_AllOf(m_Value(Src1),
46532 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46533 return SDValue();
46534
46535 // Create the SAD instruction.
46536 SDLoc DL(Extract);
46537 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46538
46539 // If the original vector was wider than 8 elements, sum over the results
46540 // in the SAD vector.
46541 unsigned Stages = Log2_32(VT.getVectorNumElements());
46542 EVT SadVT = SAD.getValueType();
46543 if (Stages > 3) {
46544 unsigned SadElems = SadVT.getVectorNumElements();
46545
46546 for(unsigned i = Stages - 3; i > 0; --i) {
46547 SmallVector<int, 16> Mask(SadElems, -1);
46548 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46549 Mask[j] = MaskEnd + j;
46550
46551 SDValue Shuffle =
46552 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46553 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46554 }
46555 }
46556
46557 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46558 // Return the lowest ExtractSizeInBits bits.
46559 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46560 SadVT.getSizeInBits() / ExtractSizeInBits);
46561 SAD = DAG.getBitcast(ResVT, SAD);
46562 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46563 Extract->getOperand(1));
46564}
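// Illustrative note (added commentary, not part of the upstream source):
// PSADBW produces, per 64-bit lane, sum_{j=0..7} |a.u8[j] - b.u8[j]| zero
// extended to 64 bits, which is why a reduce.add over zext(abdu(x, y)) can be
// replaced by a single PSADBW followed by the shuffle/add pyramid above.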
46565
46566// If this extract is from a loaded vector value and will be used as an
46567// integer, that requires a potentially expensive XMM -> GPR transfer.
46568// Additionally, if we can convert to a scalar integer load, that will likely
46569// be folded into a subsequent integer op.
46570// Note: SrcVec might not have a VecVT type, but it must be the same size.
46571// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46572// to a single-use of the loaded vector. For the reasons above, we
46573// expect this to be profitable even if it creates an extra load.
46574static SDValue
46575 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46576 const SDLoc &dl, SelectionDAG &DAG,
46577 TargetLowering::DAGCombinerInfo &DCI) {
46578 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46579 "Only EXTRACT_VECTOR_ELT supported so far");
46580
46581 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46582 EVT VT = N->getValueType(0);
46583
46584 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46585 return Use->getOpcode() == ISD::STORE ||
46586 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46587 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46588 });
46589
46590 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46591 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46592 VecVT.getVectorElementType() == VT &&
46593 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46594 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46595 SDValue NewPtr = TLI.getVectorElementPointer(
46596 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46597 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46598 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46599 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46600 SDValue Load =
46601 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46602 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46603 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46604 return Load;
46605 }
46606
46607 return SDValue();
46608}
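// Illustrative note (added commentary, not part of the upstream source): the
// fold above turns e.g.
//   (i32 extract_vector_elt (load <4 x i32>, %p), 2)
// into a plain scalar i32 load from %p + 8, avoiding an XMM->GPR transfer at
// the cost of a (usually folded) extra load.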
46609
46610// Attempt to peek through a target shuffle and extract the scalar from the
46611// source.
46612 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46613 TargetLowering::DAGCombinerInfo &DCI,
46614 const X86Subtarget &Subtarget) {
46615 if (DCI.isBeforeLegalizeOps())
46616 return SDValue();
46617
46618 SDLoc dl(N);
46619 SDValue Src = N->getOperand(0);
46620 SDValue Idx = N->getOperand(1);
46621
46622 EVT VT = N->getValueType(0);
46623 EVT SrcVT = Src.getValueType();
46624 EVT SrcSVT = SrcVT.getVectorElementType();
46625 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46626 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46627
46628 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46629 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46630 return SDValue();
46631
46632 const APInt &IdxC = N->getConstantOperandAPInt(1);
46633 if (IdxC.uge(NumSrcElts))
46634 return SDValue();
46635
46636 SDValue SrcBC = peekThroughBitcasts(Src);
46637
46638 // Handle extract(bitcast(broadcast(scalar_value))).
46639 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46640 SDValue SrcOp = SrcBC.getOperand(0);
46641 EVT SrcOpVT = SrcOp.getValueType();
46642 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46643 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46644 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46645 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46646 // TODO support non-zero offsets.
46647 if (Offset == 0) {
46648 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46649 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46650 return SrcOp;
46651 }
46652 }
46653 }
46654
46655 // If we're extracting a single element from a broadcast load and there are
46656 // no other users, just create a single load.
46657 if (X86ISD::VBROADCAST_LOAD == SrcBC.getOpcode() &&
46658 SrcBC.hasOneUse()) {
46659 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46660 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46661 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46662 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46663 SDValue Load =
46664 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46665 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46666 MemIntr->getMemOperand()->getFlags());
46667 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46668 return Load;
46669 }
46670 }
46671
46672 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46673 // TODO: Move to DAGCombine?
46674 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46675 SrcBC.getValueType().isInteger() &&
46676 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46677 SrcBC.getScalarValueSizeInBits() ==
46678 SrcBC.getOperand(0).getValueSizeInBits()) {
46679 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46680 if (IdxC.ult(Scale)) {
46681 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46682 SDValue Scl = SrcBC.getOperand(0);
46683 EVT SclVT = Scl.getValueType();
46684 if (Offset) {
46685 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46686 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46687 }
46688 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46689 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46690 return Scl;
46691 }
46692 }
46693
46694 // Handle extract(truncate(x)) for 0'th index.
46695 // TODO: Treat this as a faux shuffle?
46696 // TODO: When can we use this for general indices?
46697 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46698 (SrcVT.getSizeInBits() % 128) == 0) {
46699 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46700 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46701 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46702 Idx);
46703 }
46704
46705 // We can only legally extract other elements from 128-bit vectors and in
46706 // certain circumstances, depending on SSE-level.
46707 // TODO: Investigate float/double extraction if it will be just stored.
46708 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46709 unsigned Idx) {
46710 EVT VecSVT = VecVT.getScalarType();
46711 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46712 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46713 VecSVT == MVT::i64)) {
46714 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46715 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46716 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46717 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46718 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46719 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46720 Idx &= (NumEltsPerLane - 1);
46721 }
46722 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46723 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46724 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46725 DAG.getBitcast(VecVT, Vec),
46726 DAG.getVectorIdxConstant(Idx, dl));
46727 }
46728 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46729 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46730 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46731 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46732 DAG.getTargetConstant(Idx, dl, MVT::i8));
46733 }
46734 return SDValue();
46735 };
46736
46737 // Resolve the target shuffle inputs and mask.
46738 SmallVector<int, 16> Mask;
46739 SmallVector<SDValue, 2> Ops;
46740 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46741 return SDValue();
46742
46743 // Shuffle inputs must be the same size as the result.
46744 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46745 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46746 }))
46747 return SDValue();
46748
46749 // Attempt to narrow/widen the shuffle mask to the correct size.
46750 if (Mask.size() != NumSrcElts) {
46751 if ((NumSrcElts % Mask.size()) == 0) {
46752 SmallVector<int, 16> ScaledMask;
46753 int Scale = NumSrcElts / Mask.size();
46754 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46755 Mask = std::move(ScaledMask);
46756 } else if ((Mask.size() % NumSrcElts) == 0) {
46757 // Simplify Mask based on demanded element.
46758 int ExtractIdx = (int)IdxC.getZExtValue();
46759 int Scale = Mask.size() / NumSrcElts;
46760 int Lo = Scale * ExtractIdx;
46761 int Hi = Scale * (ExtractIdx + 1);
46762 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46763 if (i < Lo || Hi <= i)
46764 Mask[i] = SM_SentinelUndef;
46765
46766 SmallVector<int, 16> WidenedMask;
46767 while (Mask.size() > NumSrcElts &&
46768 canWidenShuffleElements(Mask, WidenedMask))
46769 Mask = std::move(WidenedMask);
46770 }
46771 }
46772
46773 // If narrowing/widening failed, see if we can extract+zero-extend.
46774 int ExtractIdx;
46775 EVT ExtractVT;
46776 if (Mask.size() == NumSrcElts) {
46777 ExtractIdx = Mask[IdxC.getZExtValue()];
46778 ExtractVT = SrcVT;
46779 } else {
46780 unsigned Scale = Mask.size() / NumSrcElts;
46781 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46782 return SDValue();
46783 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46784 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46785 return SDValue();
46786 ExtractIdx = Mask[ScaledIdx];
46787 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46788 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46789 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46790 "Failed to widen vector type");
46791 }
46792
46793 // If the shuffle source element is undef/zero then we can just accept it.
46794 if (ExtractIdx == SM_SentinelUndef)
46795 return DAG.getUNDEF(VT);
46796
46797 if (ExtractIdx == SM_SentinelZero)
46798 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46799 : DAG.getConstant(0, dl, VT);
46800
46801 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46802 ExtractIdx = ExtractIdx % Mask.size();
46803 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46804 return DAG.getZExtOrTrunc(V, dl, VT);
46805
46806 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46807 if (SDValue V = combineExtractFromVectorLoad(
46808 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46809 return V;
46810
46811 return SDValue();
46812}
46813
46814/// Extracting a scalar FP value from vector element 0 is free, so extract each
46815/// operand first, then perform the math as a scalar op.
46816 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46817 const X86Subtarget &Subtarget,
46818 TargetLowering::DAGCombinerInfo &DCI) {
46819 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46820 SDValue Vec = ExtElt->getOperand(0);
46821 SDValue Index = ExtElt->getOperand(1);
46822 EVT VT = ExtElt->getValueType(0);
46823 EVT VecVT = Vec.getValueType();
46824
46825 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46826 // non-zero element because the shuffle+scalar op will be cheaper?
46827 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46828 return SDValue();
46829
46830 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46831 // extract, the condition code), so deal with those as a special-case.
46832 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46833 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46834 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46835 return SDValue();
46836
46837 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46838 SDLoc DL(ExtElt);
46839 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46840 Vec.getOperand(0), Index);
46841 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46842 Vec.getOperand(1), Index);
46843 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46844 }
46845
46846 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46847 VT != MVT::f64)
46848 return SDValue();
46849
46850 // Vector FP selects don't fit the pattern of FP math ops (because the
46851 // condition has a different type and we have to change the opcode), so deal
46852 // with those here.
46853 // FIXME: This is restricted to pre type legalization. If we loosen this we
46854 // need to convert vector bool to a scalar bool.
46855 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46856 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46857 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46858 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46859 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46860 SDLoc DL(ExtElt);
46861 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46862 Vec.getOperand(0).getValueType().getScalarType(),
46863 Vec.getOperand(0), Index);
46864 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46865 Vec.getOperand(1), Index);
46866 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46867 Vec.getOperand(2), Index);
46868 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46869 }
46870
46871 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46872 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46873 // missed load folding and fma+fneg combining.
46874 switch (Vec.getOpcode()) {
46875 case ISD::FMA: // Begin 3 operands
46876 case ISD::FMAD:
46877 case ISD::FADD: // Begin 2 operands
46878 case ISD::FSUB:
46879 case ISD::FMUL:
46880 case ISD::FDIV:
46881 case ISD::FREM:
46882 case ISD::FCOPYSIGN:
46883 case ISD::FMINNUM:
46884 case ISD::FMAXNUM:
46885 case ISD::FMINNUM_IEEE:
46886 case ISD::FMAXNUM_IEEE:
46887 case ISD::FMAXIMUM:
46888 case ISD::FMINIMUM:
46889 case ISD::FMAXIMUMNUM:
46890 case ISD::FMINIMUMNUM:
46891 case X86ISD::FMAX:
46892 case X86ISD::FMIN:
46893 case ISD::FABS: // Begin 1 operand
46894 case ISD::FSQRT:
46895 case ISD::FRINT:
46896 case ISD::FCEIL:
46897 case ISD::FTRUNC:
46898 case ISD::FNEARBYINT:
46899 case ISD::FROUNDEVEN:
46900 case ISD::FROUND:
46901 case ISD::FFLOOR:
46902 case X86ISD::FRCP:
46903 case X86ISD::FRSQRT: {
46904 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46905 SDLoc DL(ExtElt);
46906 SmallVector<SDValue, 4> ExtOps;
46907 for (SDValue Op : Vec->ops())
46908 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46909 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46910 }
46911 default:
46912 return SDValue();
46913 }
46914 llvm_unreachable("All opcodes should return within switch");
46915}
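// Illustrative note (added commentary, not part of the upstream source): a
// typical fold performed above is
//   (f32 extract_vector_elt (fadd v4f32 X, Y), 0)
//     -> (fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0))
// which is free because element 0 of an XMM register already is the scalar.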
46916
46917/// Try to convert a vector reduction sequence composed of binops and shuffles
46918/// into horizontal ops.
46919 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46920 const X86Subtarget &Subtarget) {
46921 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46922
46923 // We need at least SSE2 to do anything here.
46924 if (!Subtarget.hasSSE2())
46925 return SDValue();
46926
46927 ISD::NodeType Opc;
46928 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46929 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46930 if (!Rdx)
46931 return SDValue();
46932
46933 SDValue Index = ExtElt->getOperand(1);
46934 assert(isNullConstant(Index) &&
46935 "Reduction doesn't end in an extract from index 0");
46936
46937 EVT VT = ExtElt->getValueType(0);
46938 EVT VecVT = Rdx.getValueType();
46939 if (VecVT.getScalarType() != VT)
46940 return SDValue();
46941
46942 SDLoc DL(ExtElt);
46943 unsigned NumElts = VecVT.getVectorNumElements();
46944 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46945
46946 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46947 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46948 if (V.getValueType() == MVT::v4i8) {
46949 if (ZeroExtend && Subtarget.hasSSE41()) {
46950 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46951 DAG.getConstant(0, DL, MVT::v4i32),
46952 DAG.getBitcast(MVT::i32, V),
46953 DAG.getVectorIdxConstant(0, DL));
46954 return DAG.getBitcast(MVT::v16i8, V);
46955 }
46956 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46957 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46958 : DAG.getUNDEF(MVT::v4i8));
46959 }
46960 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46961 DAG.getUNDEF(MVT::v8i8));
46962 };
46963
46964 // vXi8 mul reduction - promote to vXi16 mul reduction.
46965 if (Opc == ISD::MUL) {
46966 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
46967 return SDValue();
46968 if (VecVT.getSizeInBits() >= 128) {
46969 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
46970 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46971 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
46972 Lo = DAG.getBitcast(WideVT, Lo);
46973 Hi = DAG.getBitcast(WideVT, Hi);
46974 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
46975 while (Rdx.getValueSizeInBits() > 128) {
46976 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
46977 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
46978 }
46979 } else {
46980 Rdx = WidenToV16I8(Rdx, false);
46981 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
46982 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
46983 }
46984 if (NumElts >= 8)
46985 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46986 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46987 {4, 5, 6, 7, -1, -1, -1, -1}));
46988 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46989 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46990 {2, 3, -1, -1, -1, -1, -1, -1}));
46991 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
46992 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
46993 {1, -1, -1, -1, -1, -1, -1, -1}));
46994 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
46995 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
46996 }
46997
46998 // vXi8 add reduction - sub 128-bit vector.
46999 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47000 Rdx = WidenToV16I8(Rdx, true);
47001 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47002 DAG.getConstant(0, DL, MVT::v16i8));
47003 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47004 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47005 }
47006
47007 // Must be a >=128-bit vector with pow2 elements.
47008 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47009 return SDValue();
47010
47011 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47012 if (VT == MVT::i8) {
47013 while (Rdx.getValueSizeInBits() > 128) {
47014 SDValue Lo, Hi;
47015 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47016 VecVT = Lo.getValueType();
47017 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47018 }
47019 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47020
47021 SDValue Hi = DAG.getVectorShuffle(
47022 MVT::v16i8, DL, Rdx, Rdx,
47023 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47024 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47025 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47026 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47027 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47028 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47029 }
47030
47031 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47032 // If the source vector values are 0-255, then we can use PSADBW to
47033 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47034 // TODO: See if its worth avoiding vXi16/i32 truncations?
47035 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47036 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47037 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47038 Subtarget.hasAVX512())) {
47039 if (Rdx.getValueType() == MVT::v8i16) {
47040 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47041 DAG.getUNDEF(MVT::v8i16));
47042 } else {
47043 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47044 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47045 if (ByteVT.getSizeInBits() < 128)
47046 Rdx = WidenToV16I8(Rdx, true);
47047 }
47048
47049 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47050 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47051 ArrayRef<SDValue> Ops) {
47052 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47053 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47054 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47055 };
47056 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47057 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47058
47059 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47060 while (Rdx.getValueSizeInBits() > 128) {
47061 SDValue Lo, Hi;
47062 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47063 VecVT = Lo.getValueType();
47064 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47065 }
47066 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47067
47068 if (NumElts > 8) {
47069 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47070 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47071 }
47072
47073 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47074 Rdx = DAG.getBitcast(VecVT, Rdx);
47075 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47076 }
47077
47078 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
47079 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47080 return SDValue();
47081
47082 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47083
47084 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47085 // across the whole vector, so we need an extract + hop preliminary stage.
47086 // This is the only step where the operands of the hop are not the same value.
47087 // TODO: We could extend this to handle 512-bit or even longer vectors.
47088 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47089 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47090 unsigned NumElts = VecVT.getVectorNumElements();
47091 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47092 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47093 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47094 VecVT = Rdx.getValueType();
47095 }
47096 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47097 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47098 return SDValue();
47099
47100 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47101 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47102 for (unsigned i = 0; i != ReductionSteps; ++i)
47103 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47104
47105 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47106}
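// Illustrative note (added commentary, not part of the upstream source): for a
// v4i32 add reduction the HADD path above emits
//   t = phaddd x, x   ; t = [x0+x1, x2+x3, x0+x1, x2+x3]
//   t = phaddd t, t   ; lane 0 = x0+x1+x2+x3
// and then extracts lane 0.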
47107
47108/// Detect vector gather/scatter index generation and convert it from being a
47109/// bunch of shuffles and extracts into a somewhat faster sequence.
47110/// For i686, the best sequence is apparently storing the value and loading
47111/// scalars back, while for x64 we should use 64-bit extracts and shifts.
47112 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47113 TargetLowering::DAGCombinerInfo &DCI,
47114 const X86Subtarget &Subtarget) {
47115 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47116 return NewOp;
47117
47118 SDValue InputVector = N->getOperand(0);
47119 SDValue EltIdx = N->getOperand(1);
47120 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47121
47122 EVT SrcVT = InputVector.getValueType();
47123 EVT VT = N->getValueType(0);
47124 SDLoc dl(InputVector);
47125 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47126 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47127 unsigned NumEltBits = VT.getScalarSizeInBits();
47128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47129
47130 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47131 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47132
47133 // Integer Constant Folding.
47134 if (CIdx && VT.isInteger()) {
47135 APInt UndefVecElts;
47136 SmallVector<APInt, 16> EltBits;
47137 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47138 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47139 EltBits, /*AllowWholeUndefs*/ true,
47140 /*AllowPartialUndefs*/ false)) {
47141 uint64_t Idx = CIdx->getZExtValue();
47142 if (UndefVecElts[Idx])
47143 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47144 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47145 }
47146
47147 // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
47148 // Improves lowering of bool masks on rust which splits them into byte array.
47149 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47150 SDValue Src = peekThroughBitcasts(InputVector);
47151 if (Src.getValueType().getScalarType() == MVT::i1 &&
47152 TLI.isTypeLegal(Src.getValueType())) {
47153 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47154 SDValue Sub = DAG.getNode(
47155 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47156 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47157 return DAG.getBitcast(VT, Sub);
47158 }
47159 }
47160 }
47161
47162 if (IsPextr) {
47163 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47164 DCI))
47165 return SDValue(N, 0);
47166
47167 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47168 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47169 InputVector.getOpcode() == X86ISD::PINSRW) &&
47170 InputVector.getOperand(2) == EltIdx) {
47171 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47172 "Vector type mismatch");
47173 SDValue Scl = InputVector.getOperand(1);
47174 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47175 return DAG.getZExtOrTrunc(Scl, dl, VT);
47176 }
47177
47178 // TODO - Remove this once we can handle the implicit zero-extension of
47179 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47180 // combineBasicSADPattern.
47181 return SDValue();
47182 }
47183
47184 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
47185 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47186 InputVector.getOpcode() == ISD::BITCAST &&
47187 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47188 isNullConstant(EltIdx) && InputVector.hasOneUse())
47189 return DAG.getBitcast(VT, InputVector);
47190
47191 // Detect mmx to i32 conversion through a v2i32 elt extract.
47192 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47193 InputVector.getOpcode() == ISD::BITCAST &&
47194 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47195 isNullConstant(EltIdx) && InputVector.hasOneUse())
47196 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47197 InputVector.getOperand(0));
47198
47199 // Check whether this extract is the root of a sum of absolute differences
47200 // pattern. This has to be done here because we really want it to happen
47201 // pre-legalization.
47202 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47203 return SAD;
47204
47205 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47206 return VPDPBUSD;
47207
47208 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47209 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47210 return Cmp;
47211
47212 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47213 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47214 return MinMax;
47215
47216 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47217 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47218 return V;
47219
47220 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47221 return V;
47222
47223 if (CIdx)
47224 if (SDValue V = combineExtractFromVectorLoad(
47225 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47226 dl, DAG, DCI))
47227 return V;
47228
47229 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47230 // and then testing the relevant element.
47231 //
47232 // Note that we only combine extracts on the *same* result number, i.e.
47233 // t0 = merge_values a0, a1, a2, a3
47234 // i1 = extract_vector_elt t0, Constant:i64<2>
47235 // i1 = extract_vector_elt t0, Constant:i64<3>
47236 // but not
47237 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47238 // since the latter would need its own MOVMSK.
47239 if (SrcVT.getScalarType() == MVT::i1) {
47240 bool IsVar = !CIdx;
47241 SmallVector<SDNode *, 16> BoolExtracts;
47242 unsigned ResNo = InputVector.getResNo();
47243 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47244 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47245 Use->getOperand(0).getResNo() == ResNo &&
47246 Use->getValueType(0) == MVT::i1) {
47247 BoolExtracts.push_back(Use);
47248 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47249 return true;
47250 }
47251 return false;
47252 };
47253 // TODO: Can we drop the oneuse check for constant extracts?
47254 if (all_of(InputVector->users(), IsBoolExtract) &&
47255 (IsVar || BoolExtracts.size() > 1)) {
47256 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47257 if (SDValue BC =
47258 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47259 for (SDNode *Use : BoolExtracts) {
47260 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47261 // Mask = 1 << MaskIdx
47262 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47263 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47264 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47265 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47266 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47267 DCI.CombineTo(Use, Res);
47268 }
47269 return SDValue(N, 0);
47270 }
47271 }
47272 }
47273
47274 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47275 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47276 SDValue TruncSrc = InputVector.getOperand(0);
47277 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47278 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47279 SDValue NewExt =
47280 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47281 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47282 }
47283 }
47284
47285 return SDValue();
47286}
47287
47288// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47289// This is more or less the reverse of combineBitcastvxi1.
47290 static SDValue combineToExtendBoolVectorInReg(
47291 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47292 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47293 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47294 Opcode != ISD::ANY_EXTEND)
47295 return SDValue();
47296 if (!DCI.isBeforeLegalizeOps())
47297 return SDValue();
47298 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47299 return SDValue();
47300
47301 EVT SVT = VT.getScalarType();
47302 EVT InSVT = N0.getValueType().getScalarType();
47303 unsigned EltSizeInBits = SVT.getSizeInBits();
47304
47305 // Input type must be extending a bool vector (bit-casted from a scalar
47306 // integer) to legal integer types.
47307 if (!VT.isVector())
47308 return SDValue();
47309 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47310 return SDValue();
47311 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47312 return SDValue();
47313
47314 SDValue N00 = N0.getOperand(0);
47315 EVT SclVT = N00.getValueType();
47316 if (!SclVT.isScalarInteger())
47317 return SDValue();
47318
47319 SDValue Vec;
47320 SmallVector<int> ShuffleMask;
47321 unsigned NumElts = VT.getVectorNumElements();
47322 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47323
47324 // Broadcast the scalar integer to the vector elements.
47325 if (NumElts > EltSizeInBits) {
47326 // If the scalar integer is greater than the vector element size, then we
47327 // must split it down into sub-sections for broadcasting. For example:
47328 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47329 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47330 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47331 unsigned Scale = NumElts / EltSizeInBits;
47332 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47333 bool UseBroadcast = Subtarget.hasInt256() &&
47334 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47335 Vec = UseBroadcast
47336 ? DAG.getSplat(BroadcastVT, DL, N00)
47337 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47338 Vec = DAG.getBitcast(VT, Vec);
47339
47340 for (unsigned i = 0; i != Scale; ++i) {
47341 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47342 ShuffleMask.append(EltSizeInBits, i + Offset);
47343 }
47344 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47345 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47346 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47347 // If we have register broadcast instructions, use the scalar size as the
47348 // element type for the shuffle. Then cast to the wider element type. The
47349 // widened bits won't be used, and this might allow the use of a broadcast
47350 // load.
47351 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47352 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47353 (NumElts * EltSizeInBits) / NumElts);
47354 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47355 } else {
47356 // For smaller scalar integers, we can simply any-extend it to the vector
47357 // element size (we don't care about the upper bits) and broadcast it to all
47358 // elements.
47359 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47360 }
47361
47362 // Now, mask the relevant bit in each element.
47363 SmallVector<SDValue, 32> Bits;
47364 for (unsigned i = 0; i != NumElts; ++i) {
47365 int BitIdx = (i % EltSizeInBits);
47366 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47367 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47368 }
47369 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47370 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47371
47372 // Compare against the bitmask and extend the result.
47373 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47374 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47375 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47376
47377 // For SEXT, this is now done, otherwise shift the result down for
47378 // zero-extension.
47379 if (Opcode == ISD::SIGN_EXTEND)
47380 return Vec;
47381 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47382 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47383}
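// Illustrative note (added commentary, not part of the upstream source): for
// (v8i16 sign_extend (v8i1 bitcast (i8 C))) the code above broadcasts C, ANDs
// each lane with 1 << lane, and compares the result against that same bit, so
// e.g. C = 0b00000101 yields lanes {-1, 0, -1, 0, 0, 0, 0, 0}; the ZERO_EXTEND
// variant then shifts each lane right by 15 to get {1, 0, 1, 0, ...}.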
47384
47385/// If both arms of a vector select are concatenated vectors, split the select,
47386/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47387/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47388/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47389 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47390 const X86Subtarget &Subtarget) {
47391 unsigned Opcode = N->getOpcode();
47392 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47393 return SDValue();
47394
47395 // TODO: Split 512-bit vectors too?
47396 EVT VT = N->getValueType(0);
47397 if (!VT.is256BitVector())
47398 return SDValue();
47399
47400 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47401 SDValue Cond = N->getOperand(0);
47402 SDValue TVal = N->getOperand(1);
47403 SDValue FVal = N->getOperand(2);
47404 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47405 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47406 return SDValue();
47407
47408 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47409 ArrayRef<SDValue> Ops) {
47410 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47411 };
47412 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47413 /*CheckBWI*/ false);
47414}
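// Illustrative note (added commentary, not part of the upstream source): the
// split above rewrites
//   (v8i32 vselect C, (concat T0, T1), (concat F0, F1))
// as
//   (concat (v4i32 vselect C.lo, T0, F0), (v4i32 vselect C.hi, T1, F1))
// so each half can be handled with a 128-bit blend.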
47415
47416 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47417 const SDLoc &DL) {
47418 SDValue Cond = N->getOperand(0);
47419 SDValue LHS = N->getOperand(1);
47420 SDValue RHS = N->getOperand(2);
47421
47422 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47423 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47424 if (!TrueC || !FalseC)
47425 return SDValue();
47426
47427 // Don't do this for crazy integer types.
47428 EVT VT = N->getValueType(0);
47429 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47430 return SDValue();
47431
47432 // We're going to use the condition bit in math or logic ops. We could allow
47433 // this with a wider condition value (post-legalization it becomes an i8),
47434 // but if nothing is creating selects that late, it doesn't matter.
47435 if (Cond.getValueType() != MVT::i1)
47436 return SDValue();
47437
47438 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47439 // 3, 5, or 9 with i32/i64, so those get transformed too.
47440 // TODO: For constants that overflow or do not differ by power-of-2 or small
47441 // multiplier, convert to 'and' + 'add'.
47442 const APInt &TrueVal = TrueC->getAPIntValue();
47443 const APInt &FalseVal = FalseC->getAPIntValue();
47444
47445 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47446 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47447 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47448 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47449 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47450 return SDValue();
47451 }
47452
47453 bool OV;
47454 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47455 if (OV)
47456 return SDValue();
47457
47458 APInt AbsDiff = Diff.abs();
47459 if (AbsDiff.isPowerOf2() ||
47460 ((VT == MVT::i32 || VT == MVT::i64) &&
47461 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47462
47463 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47464 // of the condition can usually be folded into a compare predicate, but even
47465 // without that, the sequence should be cheaper than a CMOV alternative.
47466 if (TrueVal.slt(FalseVal)) {
47467 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47468 std::swap(TrueC, FalseC);
47469 }
47470
47471 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47472 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47473
47474 // Multiply condition by the difference if non-one.
47475 if (!AbsDiff.isOne())
47476 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47477
47478 // Add the base if non-zero.
47479 if (!FalseC->isZero())
47480 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47481
47482 return R;
47483 }
47484
47485 return SDValue();
47486}
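// Illustrative note (added commentary, not part of the upstream source): e.g.
// (select i1 %c, i32 7, i32 3) has Diff = 4, so it becomes
//   (add (mul (zext %c), 4), 3)
// i.e. a shift (or LEA) plus an add instead of a CMOV; for (select %c, 3, 7)
// the condition is first inverted so the multiplier stays positive.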
47487
47488/// If this is a *dynamic* select (non-constant condition) and we can match
47489/// this node with one of the variable blend instructions, restructure the
47490/// condition so that blends can use the high (sign) bit of each element.
47491/// This function will also call SimplifyDemandedBits on already created
47492/// BLENDV to perform additional simplifications.
47493 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47494 const SDLoc &DL,
47495 TargetLowering::DAGCombinerInfo &DCI,
47496 const X86Subtarget &Subtarget) {
47497 SDValue Cond = N->getOperand(0);
47498 if ((N->getOpcode() != ISD::VSELECT &&
47499 N->getOpcode() != X86ISD::BLENDV) ||
47500 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47501 return SDValue();
47502
47503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47504 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47505 EVT VT = N->getValueType(0);
47506
47507 // We can only handle the cases where VSELECT is directly legal on the
47508 // subtarget. We custom lower VSELECT nodes with constant conditions and
47509 // this makes it hard to see whether a dynamic VSELECT will correctly
47510 // lower, so we both check the operation's status and explicitly handle the
47511 // cases where a *dynamic* blend will fail even though a constant-condition
47512 // blend could be custom lowered.
47513 // FIXME: We should find a better way to handle this class of problems.
47514 // Potentially, we should combine constant-condition vselect nodes
47515 // pre-legalization into shuffles and not mark as many types as custom
47516 // lowered.
47517 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47518 return SDValue();
47519 // FIXME: We don't support i16-element blends currently. We could and
47520 // should support them by making *all* the bits in the condition be set
47521 // rather than just the high bit and using an i8-element blend.
47522 if (VT.getVectorElementType() == MVT::i16)
47523 return SDValue();
47524 // Dynamic blending was only available from SSE4.1 onward.
47525 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47526 return SDValue();
47527 // Byte blends are only available in AVX2
47528 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47529 return SDValue();
47530 // There are no 512-bit blend instructions that use sign bits.
47531 if (VT.is512BitVector())
47532 return SDValue();
47533
47534 // Don't optimize before the condition has been transformed to a legal type
47535 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47536 if (BitWidth < 8 || BitWidth > 64)
47537 return SDValue();
47538
47539 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47540 for (SDUse &Use : Cond->uses())
47541 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47542 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47543 Use.getOperandNo() != 0)
47544 return false;
47545
47546 return true;
47547 };
47548
47549 APInt DemandedBits(APInt::getSignMask(BitWidth));
47550
47551 if (OnlyUsedAsSelectCond(Cond)) {
47552 KnownBits Known;
47553 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47554 !DCI.isBeforeLegalizeOps());
47555 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47556 return SDValue();
47557
47558 // If we changed the computation somewhere in the DAG, this change will
47559 // affect all users of Cond. Update all the nodes so that we do not use
47560 // the generic VSELECT anymore. Otherwise, we may perform wrong
47561 // optimizations as we messed with the actual expectation for the vector
47562 // boolean values.
47563 for (SDNode *U : Cond->users()) {
47564 if (U->getOpcode() == X86ISD::BLENDV)
47565 continue;
47566
47567 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47568 Cond, U->getOperand(1), U->getOperand(2));
47569 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47570 DCI.AddToWorklist(U);
47571 }
47572 DCI.CommitTargetLoweringOpt(TLO);
47573 return SDValue(N, 0);
47574 }
47575
47576 // Otherwise we can still at least try to simplify multiple use bits.
47577 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47578 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47579 N->getOperand(1), N->getOperand(2));
47580
47581 return SDValue();
47582}
47583
47584// Try to match:
47585// (or (and (M, (sub 0, X)), (pandn M, X)))
47586// which is a special case of:
47587// (select M, (sub 0, X), X)
47588// Per:
47589// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47590// We know that, if fNegate is 0 or 1:
47591// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47592//
47593// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47594// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47595// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47596// This lets us transform our vselect to:
47597// (add (xor X, M), (and M, 1))
47598// And further to:
47599// (sub (xor X, M), M)
47600 static SDValue combineLogicBlendIntoConditionalNegate(
47601 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47602 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47603 using namespace SDPatternMatch;
47604 EVT MaskVT = Mask.getValueType();
47605 assert(MaskVT.isInteger() &&
47606 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47607 "Mask must be zero/all-bits");
47608
47609 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47610 !DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
47611 return SDValue();
47612
47613 SDValue V;
47614 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47615 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47616 return SDValue();
47617
47618 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47619 SDValue SubOp2 = Mask;
47620
47621 // If the negate was on the false side of the select, then
47622 // the operands of the SUB need to be swapped. PR 27251.
47623 // This is because the pattern being matched above is
47624 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47625 // but if the pattern matched was
47626 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47627 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47628 // pattern also needs to be a negation of the replacement pattern above.
47629 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47630 // sub accomplishes the negation of the replacement pattern.
47631 if (V == Y)
47632 std::swap(SubOp1, SubOp2);
47633
47634 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47635 return DAG.getBitcast(VT, Res);
47636}
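// Illustrative note (added commentary, not part of the upstream source): with
// M constrained to 0 or -1 per element, (sub (xor X, M), M) really is the
// conditional negate: M = -1 gives (~X) + 1 = -X, and M = 0 gives X back.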
47637
47638 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47639 const X86Subtarget &Subtarget) {
47640 using namespace SDPatternMatch;
47641 if (!Subtarget.hasAVX512())
47642 return SDValue();
47643
47644 ISD::CondCode CC;
47645 SDValue Cond, X, Y, LHS, RHS;
47646 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47647 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47648 m_CondCode(CC)))),
47649 m_Value(LHS), m_Value(RHS))))
47650 return SDValue();
47651
47652 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47653 !canCombineAsMaskOperation(RHS, Subtarget))
47654 return SDValue();
47655
47656 // Commute LHS and RHS to create opportunity to select mask instruction.
47657 // (vselect M, L, R) -> (vselect ~M, R, L)
47658 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47659 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47660 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47661}
47662
47663/// Do target-specific dag combines on SELECT and VSELECT nodes.
47664 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47665 TargetLowering::DAGCombinerInfo &DCI,
47666 const X86Subtarget &Subtarget) {
47667 SDLoc DL(N);
47668 SDValue Cond = N->getOperand(0);
47669 SDValue LHS = N->getOperand(1);
47670 SDValue RHS = N->getOperand(2);
47671
47672 // Try simplification again because we use this function to optimize
47673 // BLENDV nodes that are not handled by the generic combiner.
47674 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47675 return V;
47676
47677 // When avx512 is available the lhs operand of select instruction can be
47678 // folded with mask instruction, while the rhs operand can't. Commute the
47679 // lhs and rhs of the select instruction to create the opportunity of
47680 // folding.
47681 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47682 return V;
47683
47684 EVT VT = LHS.getValueType();
47685 EVT CondVT = Cond.getValueType();
47686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47687 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47688
47689 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47690 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47691 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47692 if (CondVT.isVector() && CondVT.isInteger() &&
47693 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47694 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47695 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47696 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47697 DL, DAG, Subtarget))
47698 return V;
47699
47700 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47701 SmallVector<int, 64> CondMask;
47702 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47703 N->getOpcode() == X86ISD::BLENDV)) {
47704 // Convert vselects with constant condition into shuffles.
47705 if (DCI.isBeforeLegalizeOps())
47706 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47707
47708 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47709 // by forcing the unselected elements to zero.
47710 // TODO: Can we handle more shuffles with this?
47711 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47712 SmallVector<SDValue, 1> LHSOps, RHSOps;
47713 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47716 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47717 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47718 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47719 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47720 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47721 assert(ByteMask.size() == LHSMask.size() &&
47722 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47723 for (auto [I, M] : enumerate(ByteMask)) {
47724 // getConstVector sets negative shuffle mask values as undef, so
47725 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47726 if (M < (int)ByteMask.size()) {
47727 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47728 RHSMask[I] = 0x80;
47729 } else {
47730 LHSMask[I] = 0x80;
47731 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47732 }
47733 }
47734 MVT ByteVT = LHSShuf.getSimpleValueType();
47735 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47736 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47737 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47738 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47739 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47740 }
47741 }
47742
47743 // Attempt to combine as shuffle.
47744 SDValue Op(N, 0);
47745 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47746 return Res;
47747 }
47748 }
47749
47750 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47751 // instructions match the semantics of the common C idiom x<y?x:y but not
47752 // x<=y?x:y, because of how they handle negative zero (which can be
47753 // ignored in unsafe-math mode).
47754 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
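  // For example, with LHS = +0.0 and RHS = -0.0:
  //   (LHS <  RHS) ? LHS : RHS selects -0.0, matching MINSS(LHS, RHS), which
  //   returns the second operand whenever LHS < RHS is false.
  //   (LHS <= RHS) ? LHS : RHS selects +0.0, so a blind MINSS would be wrong
  //   unless signed zeros (or NaNs) can be ruled out, hence the checks below.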
47755 if ((Cond.getOpcode() == ISD::SETCC ||
47756 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47757 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47758 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47759 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47760 (Subtarget.hasSSE2() ||
47761 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47762 bool IsStrict = Cond->isStrictFPOpcode();
47763 ISD::CondCode CC =
47764 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47765 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47766 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47767
47768 unsigned Opcode = 0;
47769 // Check for x CC y ? x : y.
47770 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47771 switch (CC) {
47772 default: break;
47773 case ISD::SETULT:
47774 // Converting this to a min would handle NaNs incorrectly, and swapping
47775 // the operands would cause it to handle comparisons between positive
47776 // and negative zero incorrectly.
47777 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47779 !(DAG.isKnownNeverZeroFloat(LHS) ||
47781 break;
47782 std::swap(LHS, RHS);
47783 }
47784 Opcode = X86ISD::FMIN;
47785 break;
47786 case ISD::SETOLE:
47787 // Converting this to a min would handle comparisons between positive
47788 // and negative zero incorrectly.
47791 break;
47792 Opcode = X86ISD::FMIN;
47793 break;
47794 case ISD::SETULE:
47795 // Converting this to a min would handle both negative zeros and NaNs
47796 // incorrectly, but we can swap the operands to fix both.
47797 std::swap(LHS, RHS);
47798 [[fallthrough]];
47799 case ISD::SETOLT:
47800 case ISD::SETLT:
47801 case ISD::SETLE:
47802 Opcode = X86ISD::FMIN;
47803 break;
47804
47805 case ISD::SETOGE:
47806 // Converting this to a max would handle comparisons between positive
47807 // and negative zero incorrectly.
47810 break;
47811 Opcode = X86ISD::FMAX;
47812 break;
47813 case ISD::SETUGT:
47814 // Converting this to a max would handle NaNs incorrectly, and swapping
47815 // the operands would cause it to handle comparisons between positive
47816 // and negative zero incorrectly.
47817 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47819 !(DAG.isKnownNeverZeroFloat(LHS) ||
47821 break;
47822 std::swap(LHS, RHS);
47823 }
47824 Opcode = X86ISD::FMAX;
47825 break;
47826 case ISD::SETUGE:
47827 // Converting this to a max would handle both negative zeros and NaNs
47828 // incorrectly, but we can swap the operands to fix both.
47829 std::swap(LHS, RHS);
47830 [[fallthrough]];
47831 case ISD::SETOGT:
47832 case ISD::SETGT:
47833 case ISD::SETGE:
47834 Opcode = X86ISD::FMAX;
47835 break;
47836 }
47837 // Check for x CC y ? y : x -- a min/max with reversed arms.
47838 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47839 switch (CC) {
47840 default: break;
47841 case ISD::SETOGE:
47842 // Converting this to a min would handle comparisons between positive
47843 // and negative zero incorrectly, and swapping the operands would
47844 // cause it to handle NaNs incorrectly.
47846 !(DAG.isKnownNeverZeroFloat(LHS) ||
47847 DAG.isKnownNeverZeroFloat(RHS))) {
47848 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47849 break;
47850 std::swap(LHS, RHS);
47851 }
47852 Opcode = X86ISD::FMIN;
47853 break;
47854 case ISD::SETUGT:
47855 // Converting this to a min would handle NaNs incorrectly.
47856 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47857 break;
47858 Opcode = X86ISD::FMIN;
47859 break;
47860 case ISD::SETUGE:
47861 // Converting this to a min would handle both negative zeros and NaNs
47862 // incorrectly, but we can swap the operands to fix both.
47863 std::swap(LHS, RHS);
47864 [[fallthrough]];
47865 case ISD::SETOGT:
47866 case ISD::SETGT:
47867 case ISD::SETGE:
47868 Opcode = X86ISD::FMIN;
47869 break;
47870
47871 case ISD::SETULT:
47872 // Converting this to a max would handle NaNs incorrectly.
47873 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47874 break;
47875 Opcode = X86ISD::FMAX;
47876 break;
47877 case ISD::SETOLE:
47878 // Converting this to a max would handle comparisons between positive
47879 // and negative zero incorrectly, and swapping the operands would
47880 // cause it to handle NaNs incorrectly.
47882 !DAG.isKnownNeverZeroFloat(LHS) &&
47883 !DAG.isKnownNeverZeroFloat(RHS)) {
47884 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47885 break;
47886 std::swap(LHS, RHS);
47887 }
47888 Opcode = X86ISD::FMAX;
47889 break;
47890 case ISD::SETULE:
47891 // Converting this to a max would handle both negative zeros and NaNs
47892 // incorrectly, but we can swap the operands to fix both.
47893 std::swap(LHS, RHS);
47894 [[fallthrough]];
47895 case ISD::SETOLT:
47896 case ISD::SETLT:
47897 case ISD::SETLE:
47898 Opcode = X86ISD::FMAX;
47899 break;
47900 }
47901 }
47902
47903 if (Opcode) {
47904 if (IsStrict) {
47905 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47906 : X86ISD::STRICT_FMAX,
47907 DL, {N->getValueType(0), MVT::Other},
47908 {Cond.getOperand(0), LHS, RHS});
47909 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47910 return Ret;
47911 }
47912 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47913 }
47914 }
47915
47916 // Some mask scalar intrinsics rely on checking if only one bit is set
47917 // and implement it in C code like this:
47918 // A[0] = (U & 1) ? A[0] : W[0];
47919 // This creates some redundant instructions that break pattern matching.
47920 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47921 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47922 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47923 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47924 SDValue AndNode = Cond.getOperand(0);
47925 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47926 isNullConstant(Cond.getOperand(1)) &&
47927 isOneConstant(AndNode.getOperand(1))) {
47928 // LHS and RHS swapped due to
47929 // setcc outputting 1 when AND resulted in 0 and vice versa.
47930 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47931 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47932 }
47933 }
47934
47935 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47936 // lowering on KNL. In this case we convert it to
47937 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
47938 // The same applies to all vectors of i8 and i16 elements without BWI.
47939 // Make sure we extend these even before type legalization gets a chance to
47940 // split wide vectors.
47941 // Since SKX these selects have a proper lowering.
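  // For example:
  //   v32i8 (vselect v32i1 M, v32i8 A, v32i8 B)
  // is rewritten as
  //   v32i8 (vselect (v32i8 sign_extend M), v32i8 A, v32i8 B)
  // which can lower to a byte blend (e.g. VPBLENDVB) instead of being split.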
47942 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47943 CondVT.getVectorElementType() == MVT::i1 &&
47944 (VT.getVectorElementType() == MVT::i8 ||
47945 VT.getVectorElementType() == MVT::i16)) {
47946 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47947 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47948 }
47949
47950 // AVX512 - Extend select to merge with target shuffle.
47951 // select(mask, extract_subvector(shuffle(x)), y) -->
47952 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47953 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47954 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47955 CondVT.getVectorElementType() == MVT::i1) {
47956 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
47957 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47958 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47959 isNullConstant(Op.getOperand(1)) &&
47960 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47961 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
47962 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
47963 ISD::isBuildVectorAllZeros(Alt.getNode()));
47964 };
47965
47966 bool SelectableLHS = SelectableOp(LHS, RHS);
47967 bool SelectableRHS = SelectableOp(RHS, LHS);
47968 if (SelectableLHS || SelectableRHS) {
47969 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47970 : RHS.getOperand(0).getValueType();
47971 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47972 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47973 VT.getSizeInBits());
47974 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47975 VT.getSizeInBits());
47976 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47977 DAG.getUNDEF(SrcCondVT), Cond,
47978 DAG.getVectorIdxConstant(0, DL));
47979 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47980 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47981 }
47982 }
47983
47984 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
47985 return V;
47986
47987 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47988 Cond.hasOneUse()) {
47989 EVT CondVT = Cond.getValueType();
47990 SDValue Cond0 = Cond.getOperand(0);
47991 SDValue Cond1 = Cond.getOperand(1);
47992 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47993
47994 // Canonicalize min/max:
47995 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47996 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47997 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47998 // the need for an extra compare against zero. e.g.
47999 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48000 // subl %esi, %edi
48001 // testl %edi, %edi
48002 // movl $0, %eax
48003 // cmovgl %edi, %eax
48004 // =>
48005 // xorl %eax, %eax
48006 // subl %esi, %edi
48007 // cmovsl %eax, %edi
48008 //
48009 // We can also canonicalize
48010 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48011 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48012 // This allows the use of a test instruction for the compare.
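    // For example, (x u> 1) ? x : 1 rewritten as (x != 0) ? x : 1 can be
    // selected roughly as:
    //   testl  %edi, %edi
    //   movl   $1, %eax
    //   cmovne %edi, %eax
    // avoiding a cmpl with an immediate to materialize the u> 1 test.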
48013 if (LHS == Cond0 && RHS == Cond1) {
48014 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48015 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48016 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48017 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48018 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48019 }
48020 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48021 ISD::CondCode NewCC = ISD::SETUGE;
48022 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48023 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48024 }
48025 }
48026
48027 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48028 // fold eq + gt/lt nested selects into ge/le selects
48029 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48030 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48031 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48032 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48033 // .. etc ..
48034 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48035 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48036 SDValue InnerSetCC = RHS.getOperand(0);
48037 ISD::CondCode InnerCC =
48038 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48039 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48040 Cond0 == InnerSetCC.getOperand(0) &&
48041 Cond1 == InnerSetCC.getOperand(1)) {
48042 ISD::CondCode NewCC;
48043 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48044 // clang-format off
48045 case ISD::SETGT: NewCC = ISD::SETGE; break;
48046 case ISD::SETLT: NewCC = ISD::SETLE; break;
48047 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48048 case ISD::SETULT: NewCC = ISD::SETULE; break;
48049 default: NewCC = ISD::SETCC_INVALID; break;
48050 // clang-format on
48051 }
48052 if (NewCC != ISD::SETCC_INVALID) {
48053 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48054 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48055 }
48056 }
48057 }
48058 }
48059
48060 // Check if the first operand is all zeros and Cond type is vXi1.
48061 // If this is an AVX512 target, we can improve the use of zero masking by
48062 // swapping the operands and inverting the condition.
48063 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48064 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48065 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48066 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48067 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48068 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48069 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48070 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48071 }
48072
48073 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48074 // get split by legalization.
48075 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48076 CondVT.getVectorElementType() == MVT::i1 &&
48077 TLI.isTypeLegal(VT.getScalarType())) {
48078 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48079 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48080 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48081 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48082 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48083 }
48084 }
48085
48086 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48087 // with out-of-bounds clamping.
48088
48089 // Unlike general shift nodes (SHL/SRL), AVX2's VSHLV/VSRLV handle shift
48090 // amounts exceeding the element bitwidth: any lane whose shift amount is
48091 // greater than or equal to the bitwidth produces zero, which matches the
48092 // zero operand of the select patterns below, so the compare-and-select can
48093 // be dropped.
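  // For example (v4i32, BW = 32):
  //   (vselect (setult amt, splat(32)), (shl x, amt), splat(0))
  // is equivalent to X86ISD::VSHLV(x, amt), since VPSLLVD already writes zero
  // to any lane whose shift amount is 32 or more.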
48094 if (N->getOpcode() == ISD::VSELECT) {
48095 using namespace llvm::SDPatternMatch;
48096 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48097 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48098 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48099 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48101 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48104 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48105 : X86ISD::VSHLV,
48106 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48107 }
48108 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48109 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48110 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48111 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48113 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48116 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48117 : X86ISD::VSHLV,
48118 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48119 }
48120 }
48121
48122 // Early exit check
48123 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48124 return SDValue();
48125
48126 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48127 return V;
48128
48129 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48130 return V;
48131
48132 // select(~Cond, X, Y) -> select(Cond, Y, X)
48133 if (CondVT.getScalarType() != MVT::i1) {
48134 if (SDValue CondNot = IsNOT(Cond, DAG))
48135 return DAG.getNode(N->getOpcode(), DL, VT,
48136 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48137
48138 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48139 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48140 Cond.getOperand(0).getOpcode() == ISD::AND &&
48141 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48142 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48143 Cond.getScalarValueSizeInBits(),
48144 /*AllowUndefs=*/true) &&
48145 Cond.hasOneUse()) {
48146 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48147 Cond.getOperand(0).getOperand(1));
48148 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48149 }
48150
48151 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48152 // signbit.
48153 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48154 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48155 Cond.hasOneUse()) {
48156 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48157 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48158 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48159 }
48160 }
48161
48162 // Try to optimize vXi1 selects if both operands are either all constants or
48163 // bitcasts from scalar integer type. In that case we can convert the operands
48164 // to integer and use an integer select which will be converted to a CMOV.
48165 // We need to take a little bit of care to avoid creating an i64 type after
48166 // type legalization.
48167 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48168 VT.getVectorElementType() == MVT::i1 &&
48169 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48170 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48171 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48172 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48173 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48174
48175 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48176 LHS.getOperand(0).getValueType() == IntVT)) &&
48177 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48178 RHS.getOperand(0).getValueType() == IntVT))) {
48179 if (LHSIsConst)
48181 else
48182 LHS = LHS.getOperand(0);
48183
48184 if (RHSIsConst)
48186 else
48187 RHS = RHS.getOperand(0);
48188
48189 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48190 return DAG.getBitcast(VT, Select);
48191 }
48192 }
48193 }
48194
48195 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48196 // single bits, then invert the predicate and swap the select operands.
48197 // This can lower using a vector shift bit-hack rather than mask and compare.
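  // For example, with a splat mask C = <8,8,8,8>:
  //   (vselect (seteq (and X, C), 0), LHS, RHS)
  //     --> (vselect (setne (and X, C), 0), RHS, LHS)
  // and for a non-splat powers-of-2 mask the per-lane bit test below is done
  // by shifting each selected bit up to the sign bit and comparing against 0.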
48198 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48199 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48200 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48201 Cond.getOperand(0).getOpcode() == ISD::AND &&
48202 isNullOrNullSplat(Cond.getOperand(1)) &&
48203 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48204 Cond.getOperand(0).getValueType() == VT) {
48205 // The 'and' mask must be composed of power-of-2 constants.
48206 SDValue And = Cond.getOperand(0);
48207 auto *C = isConstOrConstSplat(And.getOperand(1));
48208 if (C && C->getAPIntValue().isPowerOf2()) {
48209 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48210 SDValue NotCond =
48211 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48212 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48213 }
48214
48215 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48216 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48217 // 16-bit lacks a proper blendv.
48218 unsigned EltBitWidth = VT.getScalarSizeInBits();
48219 bool CanShiftBlend =
48220 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48221 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48222 (Subtarget.hasXOP()));
48223 if (CanShiftBlend &&
48224 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48225 return C->getAPIntValue().isPowerOf2();
48226 })) {
48227 // Create a left-shift constant to get the mask bits over to the sign-bit.
48228 SDValue Mask = And.getOperand(1);
48229 SmallVector<int, 32> ShlVals;
48230 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48231 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48232 ShlVals.push_back(EltBitWidth - 1 -
48233 MaskVal->getAPIntValue().exactLogBase2());
48234 }
48235 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48236 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48237 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48238 SDValue NewCond =
48239 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48240 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48241 }
48242 }
48243
48244 return SDValue();
48245}
48246
48247/// Combine:
48248/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48249/// to:
48250/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48251/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48252/// Note that this is only legal for some op/cc combinations.
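/// For example, for a typical reference-count decrement:
///   if (atomic_fetch_sub(&rc, 1) == 1) destroy();
/// the compare of the *old* value against 1 becomes a test of the *new* value
/// against 0, so the EFLAGS of e.g. "lock subl $1, (%rdi)" can feed the branch
/// directly and no separate cmp is needed.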
48253static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48254 SelectionDAG &DAG,
48255 const X86Subtarget &Subtarget) {
48256 // This combine only operates on CMP-like nodes.
48257 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48258 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48259 return SDValue();
48260
48261 // Can't replace the cmp if it has more uses than the one we're looking at.
48262 // FIXME: We would like to be able to handle this, but would need to make sure
48263 // all uses were updated.
48264 if (!Cmp.hasOneUse())
48265 return SDValue();
48266
48267 // This only applies to variations of the common case:
48268 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48269 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48270 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48271 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48272 // Using the proper condcodes (see below), overflow is checked for.
48273
48274 // FIXME: We can generalize both constraints:
48275 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48276 // - LHS != 1
48277 // if the result is compared.
48278
48279 SDValue CmpLHS = Cmp.getOperand(0);
48280 SDValue CmpRHS = Cmp.getOperand(1);
48281 EVT CmpVT = CmpLHS.getValueType();
48282
48283 if (!CmpLHS.hasOneUse())
48284 return SDValue();
48285
48286 unsigned Opc = CmpLHS.getOpcode();
48287 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48288 return SDValue();
48289
48290 SDValue OpRHS = CmpLHS.getOperand(2);
48291 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48292 if (!OpRHSC)
48293 return SDValue();
48294
48295 APInt Addend = OpRHSC->getAPIntValue();
48296 if (Opc == ISD::ATOMIC_LOAD_SUB)
48297 Addend = -Addend;
48298
48299 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48300 if (!CmpRHSC)
48301 return SDValue();
48302
48303 APInt Comparison = CmpRHSC->getAPIntValue();
48304 APInt NegAddend = -Addend;
48305
48306 // See if we can adjust the CC to make the comparison match the negated
48307 // addend.
48308 if (Comparison != NegAddend) {
48309 APInt IncComparison = Comparison + 1;
48310 if (IncComparison == NegAddend) {
48311 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48312 Comparison = IncComparison;
48313 CC = X86::COND_AE;
48314 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48315 Comparison = IncComparison;
48316 CC = X86::COND_L;
48317 }
48318 }
48319 APInt DecComparison = Comparison - 1;
48320 if (DecComparison == NegAddend) {
48321 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48322 Comparison = DecComparison;
48323 CC = X86::COND_A;
48324 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48325 Comparison = DecComparison;
48326 CC = X86::COND_LE;
48327 }
48328 }
48329 }
48330
48331 // If the addend is the negation of the comparison value, then we can do
48332 // a full comparison by emitting the atomic arithmetic as a locked sub.
48333 if (Comparison == NegAddend) {
48334 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48335 // atomic sub.
48336 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48337 auto AtomicSub = DAG.getAtomic(
48338 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48339 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48340 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48341 AN->getMemOperand());
48342 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48343 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48344 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48345 return LockOp;
48346 }
48347
48348 // We can handle comparisons with zero in a number of cases by manipulating
48349 // the CC used.
48350 if (!Comparison.isZero())
48351 return SDValue();
48352
48353 if (CC == X86::COND_S && Addend == 1)
48354 CC = X86::COND_LE;
48355 else if (CC == X86::COND_NS && Addend == 1)
48356 CC = X86::COND_G;
48357 else if (CC == X86::COND_G && Addend == -1)
48358 CC = X86::COND_GE;
48359 else if (CC == X86::COND_LE && Addend == -1)
48360 CC = X86::COND_L;
48361 else
48362 return SDValue();
48363
48364 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48365 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48366 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48367 return LockOp;
48368}
48369
48370// Check whether we're just testing the signbit, and whether we can simplify
48371// this by tracking where the signbit came from.
48372static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48373 SelectionDAG &DAG) {
48374 if (CC != X86::COND_S && CC != X86::COND_NS)
48375 return SDValue();
48376
48377 if (!Cmp.hasOneUse())
48378 return SDValue();
48379
48380 SDValue Src;
48381 if (Cmp.getOpcode() == X86ISD::CMP) {
48382 // CMP(X,0) -> signbit test
48383 if (!isNullConstant(Cmp.getOperand(1)))
48384 return SDValue();
48385 Src = Cmp.getOperand(0);
48386 // Peek through a SRA node as we just need the signbit.
48387 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48388 // TODO: Use SimplifyDemandedBits instead of just SRA?
48389 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48390 return SDValue();
48391 Src = Src.getOperand(0);
48392 } else if (Cmp.getOpcode() == X86ISD::OR) {
48393 // OR(X,Y) -> see if only one operand contributes to the signbit.
48394 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48395 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48396 Src = Cmp.getOperand(1);
48397 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48398 Src = Cmp.getOperand(0);
48399 else
48400 return SDValue();
48401 } else {
48402 return SDValue();
48403 }
48404
48405 // Replace with a TEST on the MSB.
48406 SDLoc DL(Cmp);
48407 MVT SrcVT = Src.getSimpleValueType();
48408 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48409
48410 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48411 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48412 if (Src.getOpcode() == ISD::SHL) {
48413 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48414 Src = Src.getOperand(0);
48415 BitMask.lshrInPlace(*ShiftAmt);
48416 }
48417 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48418 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48419 Src = Src.getOperand(0);
48420 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48421 }
48422
48423 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48424 DAG.getConstant(BitMask, DL, SrcVT));
48425 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48426 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48427 DAG.getConstant(0, DL, SrcVT));
48428}
48429
48430// Check whether a boolean test is testing a boolean value generated by
48431// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48432// code.
48433//
48434// Simplify the following patterns:
48435// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48436// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48437// to (Op EFLAGS Cond)
48438//
48439// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48440// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48441// to (Op EFLAGS !Cond)
48442//
48443// where Op could be BRCOND or CMOV.
48444//
48445static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48446 // This combine only operates on CMP-like nodes.
48447 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48448 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48449 return SDValue();
48450
48451 // Quit if not used as a boolean value.
48452 if (CC != X86::COND_E && CC != X86::COND_NE)
48453 return SDValue();
48454
48455 // Check CMP operands. One of them should be 0 or 1 and the other should be
48456 // a SetCC or extended from it.
48457 SDValue Op1 = Cmp.getOperand(0);
48458 SDValue Op2 = Cmp.getOperand(1);
48459
48460 SDValue SetCC;
48461 const ConstantSDNode* C = nullptr;
48462 bool needOppositeCond = (CC == X86::COND_E);
48463 bool checkAgainstTrue = false; // Is it a comparison against 1?
48464
48465 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48466 SetCC = Op2;
48467 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48468 SetCC = Op1;
48469 else // Quit if neither operand is a constant.
48470 return SDValue();
48471
48472 if (C->getZExtValue() == 1) {
48473 needOppositeCond = !needOppositeCond;
48474 checkAgainstTrue = true;
48475 } else if (C->getZExtValue() != 0)
48476 // Quit if the constant is neither 0 nor 1.
48477 return SDValue();
48478
48479 bool truncatedToBoolWithAnd = false;
48480 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48481 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48482 SetCC.getOpcode() == ISD::TRUNCATE ||
48483 SetCC.getOpcode() == ISD::AND) {
48484 if (SetCC.getOpcode() == ISD::AND) {
48485 int OpIdx = -1;
48486 if (isOneConstant(SetCC.getOperand(0)))
48487 OpIdx = 1;
48488 if (isOneConstant(SetCC.getOperand(1)))
48489 OpIdx = 0;
48490 if (OpIdx < 0)
48491 break;
48492 SetCC = SetCC.getOperand(OpIdx);
48493 truncatedToBoolWithAnd = true;
48494 } else
48495 SetCC = SetCC.getOperand(0);
48496 }
48497
48498 switch (SetCC.getOpcode()) {
48499 case X86ISD::SETCC_CARRY:
48500 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48501 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48502 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48503 // truncated to i1 using 'and'.
48504 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48505 break;
48506 assert(SetCC.getConstantOperandVal(0) == X86::COND_B &&
48507 "Invalid use of SETCC_CARRY!");
48508 [[fallthrough]];
48509 case X86ISD::SETCC:
48510 // Set the condition code or opposite one if necessary.
48511 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48512 if (needOppositeCond)
48513 CC = X86::GetOppositeBranchCondition(CC);
48514 return SetCC.getOperand(1);
48515 case X86ISD::CMOV: {
48516 // Check whether false/true value has canonical one, i.e. 0 or 1.
48517 auto *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48518 auto *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48519 // Quit if true value is not a constant.
48520 if (!TVal)
48521 return SDValue();
48522 // Quit if false value is not a constant.
48523 if (!FVal) {
48524 SDValue Op = SetCC.getOperand(0);
48525 // Skip 'zext' or 'trunc' node.
48526 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48527 Op.getOpcode() == ISD::TRUNCATE)
48528 Op = Op.getOperand(0);
48529 // A special case for rdrand/rdseed, where 0 is set if false cond is
48530 // found.
48531 if ((Op.getOpcode() != X86ISD::RDRAND &&
48532 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48533 return SDValue();
48534 }
48535 // Quit if false value is not the constant 0 or 1.
48536 bool FValIsFalse = true;
48537 if (FVal && FVal->getZExtValue() != 0) {
48538 if (FVal->getZExtValue() != 1)
48539 return SDValue();
48540 // If FVal is 1, opposite cond is needed.
48541 needOppositeCond = !needOppositeCond;
48542 FValIsFalse = false;
48543 }
48544 // Quit if TVal is not the constant opposite of FVal.
48545 if (FValIsFalse && TVal->getZExtValue() != 1)
48546 return SDValue();
48547 if (!FValIsFalse && TVal->getZExtValue() != 0)
48548 return SDValue();
48549 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48550 if (needOppositeCond)
48551 CC = X86::GetOppositeBranchCondition(CC);
48552 return SetCC.getOperand(3);
48553 }
48554 }
48555
48556 return SDValue();
48557}
48558
48559/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48560/// Match:
48561/// (X86or (X86setcc) (X86setcc))
48562/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48563static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48564 X86::CondCode &CC1, SDValue &Flags,
48565 bool &isAnd) {
48566 if (Cond->getOpcode() == X86ISD::CMP) {
48567 if (!isNullConstant(Cond->getOperand(1)))
48568 return false;
48569
48570 Cond = Cond->getOperand(0);
48571 }
48572
48573 isAnd = false;
48574
48575 SDValue SetCC0, SetCC1;
48576 switch (Cond->getOpcode()) {
48577 default: return false;
48578 case ISD::AND:
48579 case X86ISD::AND:
48580 isAnd = true;
48581 [[fallthrough]];
48582 case ISD::OR:
48583 case X86ISD::OR:
48584 SetCC0 = Cond->getOperand(0);
48585 SetCC1 = Cond->getOperand(1);
48586 break;
48587 };
48588
48589 // Make sure we have SETCC nodes, using the same flags value.
48590 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48591 SetCC1.getOpcode() != X86ISD::SETCC ||
48592 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48593 return false;
48594
48595 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48596 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48597 Flags = SetCC0->getOperand(1);
48598 return true;
48599}
48600
48601// When legalizing carry, we create carries via add X, -1
48602// If that comes from an actual carry, via setcc, we use the
48603// carry directly.
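// For example, when a 0/1 carry value has been materialized:
//   c = (X86setcc COND_B, flags)            ; 0 or 1
//   e = (X86add (zext c), -1)               ; CF of e == the original carry
// a user of CF on 'e' can instead consume 'flags' directly.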
48604static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48605 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48606 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48607 bool FoundAndLSB = false;
48608 SDValue Carry = EFLAGS.getOperand(0);
48609 while (Carry.getOpcode() == ISD::TRUNCATE ||
48610 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48611 (Carry.getOpcode() == ISD::AND &&
48612 isOneConstant(Carry.getOperand(1)))) {
48613 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48614 Carry = Carry.getOperand(0);
48615 }
48616 if (Carry.getOpcode() == X86ISD::SETCC ||
48617 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48618 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48619 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48620 SDValue CarryOp1 = Carry.getOperand(1);
48621 if (CarryCC == X86::COND_B)
48622 return CarryOp1;
48623 if (CarryCC == X86::COND_A) {
48624 // Try to convert COND_A into COND_B in an attempt to facilitate
48625 // materializing "setb reg".
48626 //
48627 // Do not flip "e > c", where "c" is a constant, because Cmp
48628 // instruction cannot take an immediate as its first operand.
48629 //
48630 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48631 CarryOp1.getNode()->hasOneUse() &&
48632 CarryOp1.getValueType().isInteger() &&
48633 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48634 SDValue SubCommute =
48635 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48636 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48637 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48638 }
48639 }
48640 // If this is a check of the z flag of an add with 1, switch to the
48641 // C flag.
48642 if (CarryCC == X86::COND_E &&
48643 CarryOp1.getOpcode() == X86ISD::ADD &&
48644 isOneConstant(CarryOp1.getOperand(1)))
48645 return CarryOp1;
48646 } else if (FoundAndLSB) {
48647 SDLoc DL(Carry);
48648 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48649 if (Carry.getOpcode() == ISD::SRL) {
48650 BitNo = Carry.getOperand(1);
48651 Carry = Carry.getOperand(0);
48652 }
48653 return getBT(Carry, BitNo, DL, DAG);
48654 }
48655 }
48656 }
48657
48658 return SDValue();
48659}
48660
48661/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
48662/// to avoid the inversion.
48663static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48664 SelectionDAG &DAG,
48665 const X86Subtarget &Subtarget) {
48666 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48667 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48668 EFLAGS.getOpcode() != X86ISD::TESTP)
48669 return SDValue();
48670
48671 // PTEST/TESTP sets EFLAGS as:
48672 // TESTZ: ZF = (Op0 & Op1) == 0
48673 // TESTC: CF = (~Op0 & Op1) == 0
48674 // TESTNZC: ZF == 0 && CF == 0
48675 MVT VT = EFLAGS.getSimpleValueType();
48676 SDValue Op0 = EFLAGS.getOperand(0);
48677 SDValue Op1 = EFLAGS.getOperand(1);
48678 MVT OpVT = Op0.getSimpleValueType();
48679 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48680
48681 // TEST*(~X,Y) == TEST*(X,Y)
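  // e.g. for CC == COND_B (testc): CF = (~(~X) & Y) == 0 = (X & Y) == 0, which
  // is exactly the ZF (testz) condition on (X, Y), so the NOT can be dropped
  // by flipping the condition code as below.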
48682 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48683 X86::CondCode InvCC;
48684 switch (CC) {
48685 case X86::COND_B:
48686 // testc -> testz.
48687 InvCC = X86::COND_E;
48688 break;
48689 case X86::COND_AE:
48690 // !testc -> !testz.
48691 InvCC = X86::COND_NE;
48692 break;
48693 case X86::COND_E:
48694 // testz -> testc.
48695 InvCC = X86::COND_B;
48696 break;
48697 case X86::COND_NE:
48698 // !testz -> !testc.
48699 InvCC = X86::COND_AE;
48700 break;
48701 case X86::COND_A:
48702 case X86::COND_BE:
48703 // testnzc -> testnzc (no change).
48704 InvCC = CC;
48705 break;
48706 default:
48707 InvCC = X86::COND_INVALID;
48708 break;
48709 }
48710
48711 if (InvCC != X86::COND_INVALID) {
48712 CC = InvCC;
48713 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48714 DAG.getBitcast(OpVT, NotOp0), Op1);
48715 }
48716 }
48717
48718 if (CC == X86::COND_B || CC == X86::COND_AE) {
48719 // TESTC(X,~X) == TESTC(X,-1)
48720 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48721 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48722 SDLoc DL(EFLAGS);
48723 return DAG.getNode(
48724 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48725 DAG.getBitcast(OpVT,
48726 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48727 }
48728 }
48729 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48730 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48732 SDValue BC0 = peekThroughBitcasts(Op0);
48733 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48735 SDLoc DL(EFLAGS);
48736 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48737 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48738 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48739 }
48740 }
48741 }
48742
48743 if (CC == X86::COND_E || CC == X86::COND_NE) {
48744 // TESTZ(X,~Y) == TESTC(Y,X)
48745 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48746 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48747 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48748 DAG.getBitcast(OpVT, NotOp1), Op0);
48749 }
48750
48751 if (Op0 == Op1) {
48752 SDValue BC = peekThroughBitcasts(Op0);
48753 EVT BCVT = BC.getValueType();
48754
48755 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48756 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48757 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48758 DAG.getBitcast(OpVT, BC.getOperand(0)),
48759 DAG.getBitcast(OpVT, BC.getOperand(1)));
48760 }
48761
48762 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48763 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48764 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48765 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48766 DAG.getBitcast(OpVT, BC.getOperand(0)),
48767 DAG.getBitcast(OpVT, BC.getOperand(1)));
48768 }
48769
48770 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48771 // to more efficiently extract the sign bits and compare that.
48772 // TODO: Handle TESTC with comparison inversion.
48773 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48774 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48775 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48776 unsigned EltBits = BCVT.getScalarSizeInBits();
48777 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48778 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48779 APInt SignMask = APInt::getSignMask(EltBits);
48780 if (SDValue Res =
48781 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48782 // For vXi16 cases we need to use pmovmskb and extract every other
48783 // sign bit.
48784 SDLoc DL(EFLAGS);
48785 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48786 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48787 MVT FloatVT =
48788 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48789 Res = DAG.getBitcast(FloatVT, Res);
48790 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48791 } else if (EltBits == 16) {
48792 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48793 Res = DAG.getBitcast(MovmskVT, Res);
48794 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48795 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48796 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48797 } else {
48798 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48799 }
48800 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48801 DAG.getConstant(0, DL, MVT::i32));
48802 }
48803 }
48804 }
48805 }
48806
48807 // TESTZ(-1,X) == TESTZ(X,X)
48808 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48809 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48810
48811 // TESTZ(X,-1) == TESTZ(X,X)
48812 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48813 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48814
48815 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48816 // TODO: Add COND_NE handling?
48817 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48818 SDValue Src0 = peekThroughBitcasts(Op0);
48819 SDValue Src1 = peekThroughBitcasts(Op1);
48820 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48822 peekThroughBitcasts(Src0.getOperand(1)), true);
48824 peekThroughBitcasts(Src1.getOperand(1)), true);
48825 if (Src0 && Src1) {
48826 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48827 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48828 DAG.getBitcast(OpVT2, Src0),
48829 DAG.getBitcast(OpVT2, Src1));
48830 }
48831 }
48832 }
48833 }
48834
48835 return SDValue();
48836}
48837
48838// Attempt to simplify the MOVMSK input based on the comparison type.
48839static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48840 SelectionDAG &DAG,
48841 const X86Subtarget &Subtarget) {
48842 // Handle eq/ne against zero (any_of).
48843 // Handle eq/ne against -1 (all_of).
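  // e.g. for a v4f32 sign-bit reduction V:
  //   any_of: (movmskps V) != 0    - at least one lane has its sign bit set.
  //   all_of: (movmskps V) == 0xF  - every lane has its sign bit set.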
48844 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48845 return SDValue();
48846 if (EFLAGS.getValueType() != MVT::i32)
48847 return SDValue();
48848 unsigned CmpOpcode = EFLAGS.getOpcode();
48849 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48850 return SDValue();
48851 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48852 if (!CmpConstant)
48853 return SDValue();
48854 const APInt &CmpVal = CmpConstant->getAPIntValue();
48855
48856 SDValue CmpOp = EFLAGS.getOperand(0);
48857 unsigned CmpBits = CmpOp.getValueSizeInBits();
48858 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48859
48860 // Peek through any truncate.
48861 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48862 CmpOp = CmpOp.getOperand(0);
48863
48864 // Bail if we don't find a MOVMSK.
48865 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48866 return SDValue();
48867
48868 SDValue Vec = CmpOp.getOperand(0);
48869 MVT VecVT = Vec.getSimpleValueType();
48870 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48871 "Unexpected MOVMSK operand");
48872 unsigned NumElts = VecVT.getVectorNumElements();
48873 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48874
48875 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48876 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48877 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48878 if (!IsAnyOf && !IsAllOf)
48879 return SDValue();
48880
48881 // TODO: Check more combining cases.
48882 // We use the number of uses of the CMP operand to decide whether to combine.
48883 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
48884 // below are restricted by this one-use check.
48885 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48886
48887 // See if we can peek through to a vector with a wider element type, if the
48888 // signbits extend down to all the sub-elements as well.
48889 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48890 // potential SimplifyDemandedBits/Elts cases.
48891 // If we looked through a truncate that discarded bits, we can't do this
48892 // transform.
48893 // FIXME: We could do this transform for truncates that discarded bits by
48894 // inserting an AND mask between the new MOVMSK and the CMP.
48895 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48896 SDValue BC = peekThroughBitcasts(Vec);
48897 MVT BCVT = BC.getSimpleValueType();
48898 unsigned BCNumElts = BCVT.getVectorNumElements();
48899 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48900 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48901 BCNumEltBits > NumEltBits &&
48902 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48903 SDLoc DL(EFLAGS);
48904 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48905 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48906 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48907 DAG.getConstant(CmpMask, DL, MVT::i32));
48908 }
48909 }
48910
48911 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48912 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48913 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48914 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48915 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48916 SmallVector<SDValue> Ops;
48917 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48918 Ops.size() == 2) {
48919 SDLoc DL(EFLAGS);
48920 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48921 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48922 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48923 DAG.getBitcast(SubVT, Ops[0]),
48924 DAG.getBitcast(SubVT, Ops[1]));
48925 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48926 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48927 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48928 DAG.getConstant(CmpMask, DL, MVT::i32));
48929 }
48930 }
48931
48932 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48933 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48934 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48935 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48936 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48937 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48938 SDValue BC = peekThroughBitcasts(Vec);
48939 // Ensure MOVMSK was testing every signbit of BC.
48940 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48941 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48942 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48943 BC.getOperand(0), BC.getOperand(1));
48944 V = DAG.getBitcast(TestVT, V);
48945 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48946 }
48947 // Check for 256-bit split vector cases.
48948 if (BC.getOpcode() == ISD::AND &&
48949 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48950 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48951 SDValue LHS = BC.getOperand(0);
48952 SDValue RHS = BC.getOperand(1);
48953 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48954 LHS.getOperand(0), LHS.getOperand(1));
48955 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48956 RHS.getOperand(0), RHS.getOperand(1));
48957 LHS = DAG.getBitcast(TestVT, LHS);
48958 RHS = DAG.getBitcast(TestVT, RHS);
48959 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48960 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48961 }
48962 }
48963 }
48964
48965 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48966 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48967 // sign bits prior to the comparison with zero unless we know that
48968 // the vXi16 splats the sign bit down to the lower i8 half.
48969 // TODO: Handle all_of patterns.
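  // e.g. for an any_of test of a v8i16 compare result X:
  //   (movmsk (packsswb X, undef)) != 0
  // becomes
  //   ((pmovmskb (bitcast X to v16i8)) & 0xAAAA) != 0
  // where the 0xAAAA mask keeps only the high byte of each i16 lane and can be
  // dropped when X is known to splat its sign bit across the whole lane.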
48970 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48971 SDValue VecOp0 = Vec.getOperand(0);
48972 SDValue VecOp1 = Vec.getOperand(1);
48973 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48974 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48975 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48976 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48977 SDLoc DL(EFLAGS);
48978 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48979 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48980 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48981 if (!SignExt0) {
48982 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48983 DAG.getConstant(0xAAAA, DL, MVT::i16));
48984 }
48985 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48986 DAG.getConstant(0, DL, MVT::i16));
48987 }
48988 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48989 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48990 if (CmpBits >= 16 && Subtarget.hasInt256() &&
48991 (IsAnyOf || (SignExt0 && SignExt1))) {
48992 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48993 SDLoc DL(EFLAGS);
48994 SDValue Result = peekThroughBitcasts(Src);
48995 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48996 Result.getValueType().getVectorNumElements() <= NumElts) {
48997 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48998 Result.getOperand(0), Result.getOperand(1));
48999 V = DAG.getBitcast(MVT::v4i64, V);
49000 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49001 }
49002 Result = DAG.getBitcast(MVT::v32i8, Result);
49003 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49004 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49005 if (!SignExt0 || !SignExt1) {
49006 assert(IsAnyOf &&
49007 "Only perform v16i16 signmasks for any_of patterns");
49008 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49009 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49010 }
49011 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49012 DAG.getConstant(CmpMask, DL, MVT::i32));
49013 }
49014 }
49015 }
49016
49017 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49018 // Since we peek through a bitcast, we need to be careful if the base vector
49019 // type has smaller elements than the MOVMSK type. In that case, even if
49020 // all the elements are demanded by the shuffle mask, only the "high"
49021 // elements which have highbits that align with highbits in the MOVMSK vec
49022 // elements are actually demanded. A simplification of spurious operations
49023 // on the "low" elements takes place during other simplifications.
49024 //
49025 // For example:
49026 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49027 // demanded, the result can change because we are swapping elements around.
49028 //
49029 // To address this, we check that we can scale the shuffle mask to MOVMSK
49030 // element width (this will ensure "high" elements match). It's slightly
49031 // overly conservative, but fine for an edge case fold.
49032 SmallVector<int, 32> ShuffleMask;
49033 SmallVector<SDValue, 2> ShuffleInputs;
49034 if (NumElts <= CmpBits &&
49035 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49036 ShuffleMask, DAG) &&
49037 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49038 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49039 canScaleShuffleElements(ShuffleMask, NumElts)) {
49040 SDLoc DL(EFLAGS);
49041 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49042 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49043 Result =
49044 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49045 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49046 }
49047
49048 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49049 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49050 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49051 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49052 // iff every element is referenced.
49053 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49054 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49055 (NumEltBits == 32 || NumEltBits == 64)) {
49056 SDLoc DL(EFLAGS);
49057 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49058 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49059 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49060 SDValue LHS = Vec;
49061 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49062 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49063 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49064 DAG.getBitcast(FloatVT, LHS),
49065 DAG.getBitcast(FloatVT, RHS));
49066 }
49067
49068 return SDValue();
49069}
49070
49071/// Optimize an EFLAGS definition used according to the condition code \p CC
49072/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49073/// uses of chain values.
49074static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49075 SelectionDAG &DAG,
49076 const X86Subtarget &Subtarget) {
49077 if (CC == X86::COND_B)
49078 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49079 return Flags;
49080
49081 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49082 return R;
49083
49084 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49085 return R;
49086
49087 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49088 return R;
49089
49090 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49091 return R;
49092
49093 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49094}
49095
49096/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49097static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49098 TargetLowering::DAGCombinerInfo &DCI,
49099 const X86Subtarget &Subtarget) {
49100 SDLoc DL(N);
49101 EVT VT = N->getValueType(0);
49102 SDValue FalseOp = N->getOperand(0);
49103 SDValue TrueOp = N->getOperand(1);
49104 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49105 SDValue Cond = N->getOperand(3);
49106
49107 // cmov X, X, ?, ? --> X
49108 if (TrueOp == FalseOp)
49109 return TrueOp;
49110
49111 // Try to simplify the EFLAGS and condition code operands.
49112 // We can't always do this as FCMOV only supports a subset of X86 cond.
49113 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49114 if (!(FalseOp.getValueType() == MVT::f80 ||
49115 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49116 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49117 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49118 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49119 Flags};
49120 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49121 }
49122 }
49123
49124 // If this is a select between two integer constants, try to do some
49125 // optimizations. Note that the operands are ordered the opposite of SELECT
49126 // operands.
49127 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49128 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49129 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49130 // larger than FalseC (the false value).
49131 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49132 CC = X86::GetOppositeBranchCondition(CC);
49133 std::swap(TrueC, FalseC);
49134 std::swap(TrueOp, FalseOp);
49135 }
49136
49137 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49138 // This is efficient for any integer data type (including i8/i16) and
49139 // shift amount.
49140 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49141 Cond = getSETCC(CC, Cond, DL, DAG);
49142
49143 // Zero extend the condition if needed.
49144 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49145
49146 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49147 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49148 DAG.getConstant(ShAmt, DL, MVT::i8));
49149 return Cond;
49150 }
49151
49152 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49153 // for any integer data type, including i8/i16.
49154 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49155 Cond = getSETCC(CC, Cond, DL, DAG);
49156
49157 // Zero extend the condition if needed.
49158 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49159 FalseC->getValueType(0), Cond);
49160 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49161 SDValue(FalseC, 0));
49162 return Cond;
49163 }
49164
49165 // Optimize cases that will turn into an LEA instruction. This requires
49166 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
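// e.g. C ? 9 : 4: Diff = 5, so the result is computed as 4 + 5*zext(setcc(C));
// the multiply by 5 maps onto LEA addressing (cond + cond*4), with the base 4
// added afterwards (illustrative).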
49167 if (VT == MVT::i32 || VT == MVT::i64) {
49168 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49169 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49170 "Implicit constant truncation");
49171
49172 bool isFastMultiplier = false;
49173 if (Diff.ult(10)) {
49174 switch (Diff.getZExtValue()) {
49175 default: break;
49176 case 1: // result = add base, cond
49177 case 2: // result = lea base( , cond*2)
49178 case 3: // result = lea base(cond, cond*2)
49179 case 4: // result = lea base( , cond*4)
49180 case 5: // result = lea base(cond, cond*4)
49181 case 8: // result = lea base( , cond*8)
49182 case 9: // result = lea base(cond, cond*8)
49183 isFastMultiplier = true;
49184 break;
49185 }
49186 }
49187
49188 if (isFastMultiplier) {
49189 Cond = getSETCC(CC, Cond, DL ,DAG);
49190 // Zero extend the condition if needed.
49191 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49192 Cond);
49193 // Scale the condition by the difference.
49194 if (Diff != 1)
49195 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49196 DAG.getConstant(Diff, DL, Cond.getValueType()));
49197
49198 // Add the base if non-zero.
49199 if (FalseC->getAPIntValue() != 0)
49200 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49201 SDValue(FalseC, 0));
49202 return Cond;
49203 }
49204 }
49205 }
49206 }
49207
49208 // Handle these cases:
49209 // (select (x != c), e, c) -> (select (x != c), e, x),
49210 // (select (x == c), c, e) -> (select (x == c), x, e)
49211 // where the c is an integer constant, and the "select" is the combination
49212 // of CMOV and CMP.
49213 //
49214 // The rationale for this change is that a conditional-move from a constant
49215 // needs two instructions, whereas a conditional-move from a register needs
49216 // only one instruction.
49217 //
49218 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49219 // some instruction-combining opportunities. This opt needs to be
49220 // postponed as late as possible.
49221 //
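// e.g. for (select (x == 42), 42, y): whenever the condition holds, the
// register holding x already contains 42, so the cmov can read x instead of
// a separately materialized constant (illustrative).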
49222 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49223 // the DCI.xxxx conditions are provided to postpone the optimization as
49224 // late as possible.
49225
49226 ConstantSDNode *CmpAgainst = nullptr;
49227 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49228 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49229 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49230
49231 if (CC == X86::COND_NE &&
49232 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49233 CC = X86::COND_E;
49234 std::swap(TrueOp, FalseOp);
49235 }
49236
49237 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49238 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49239 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49240 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49241 }
49242 }
49243 }
49244
49245 // Transform:
49246 //
49247 // (cmov 1 T (uge T 2))
49248 //
49249 // to:
49250 //
49251 // (adc T 0 (sub T 1))
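// Sanity check on this transform: for T == 0 the SUB borrows (CF = 1), so the
// ADC yields 0 + 0 + 1 == 1; for T == 1, CF = 0 and the ADC yields 1; for
// T >= 2, CF = 0 and the ADC yields T. This matches (T uge 2) ? T : 1.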
49252 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49253 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49254 SDValue Cond0 = Cond.getOperand(0);
49255 if (Cond0.getOpcode() == ISD::TRUNCATE)
49256 Cond0 = Cond0.getOperand(0);
49257 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49258 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49259 EVT CondVT = Cond->getValueType(0);
49260 // Subtract 1 and generate a carry.
49261 SDValue NewSub =
49262 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49263 DAG.getConstant(1, DL, CondVT));
49264 SDValue EFLAGS(NewSub.getNode(), 1);
49265 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49266 DAG.getConstant(0, DL, VT), EFLAGS);
49267 }
49268 }
49269
49270 // Fold and/or of setcc's to double CMOV:
49271 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49272 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49273 //
49274 // This combine lets us generate:
49275 // cmovcc1 (jcc1 if we don't have CMOV)
49276 // cmovcc2 (same)
49277 // instead of:
49278 // setcc1
49279 // setcc2
49280 // and/or
49281 // cmovne (jne if we don't have CMOV)
49282 // When we can't use the CMOV instruction, it might increase branch
49283 // mispredicts.
49284 // When we can use CMOV, or when there is no mispredict, this improves
49285 // throughput and reduces register pressure.
49286 //
49287 if (CC == X86::COND_NE) {
49288 SDValue Flags;
49289 X86::CondCode CC0, CC1;
49290 bool isAndSetCC;
49291 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49292 if (isAndSetCC) {
49293 std::swap(FalseOp, TrueOp);
49294 CC0 = X86::GetOppositeBranchCondition(CC0);
49295 CC1 = X86::GetOppositeBranchCondition(CC1);
49296 }
49297
49298 SDValue LOps[] = {FalseOp, TrueOp,
49299 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49300 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49301 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49302 Flags};
49303 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49304 return CMOV;
49305 }
49306 }
49307
49308 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49309 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49310 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49311 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49312 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49313 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
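// e.g. (cmov 32, (add (cttz_zero_undef X), 8), (X != 0)) becomes
// (add (cmov 24, (cttz_zero_undef X), (X != 0)), 8): both forms give 32 when
// X == 0 and cttz(X) + 8 otherwise (illustrative).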
49314 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49315 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49316 SDValue Add = TrueOp;
49317 SDValue Const = FalseOp;
49318 // Canonicalize the condition code for easier matching and output.
49319 if (CC == X86::COND_E)
49320 std::swap(Add, Const);
49321
49322 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49323 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49324 Add.getResNo() == 0 && Add.hasOneUse() &&
49325 Add.getOperand(1) == Cond.getOperand(0)) {
49326 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49327 Add.getOperand(1));
49328 }
49329
49330 // We might have replaced the constant in the cmov with the LHS of the
49331 // compare. If so change it to the RHS of the compare.
49332 if (Const == Cond.getOperand(0))
49333 Const = Cond.getOperand(1);
49334
49335 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49336 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49337 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49338 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49339 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49340 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49341 // This should constant fold.
49342 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49343 SDValue CMov =
49344 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49345 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49346 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49347 }
49348 }
49349
49350 return SDValue();
49351}
49352
49353/// Different mul shrinking modes.
49354enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49355
49356static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49357 EVT VT = N->getOperand(0).getValueType();
49358 if (VT.getScalarSizeInBits() != 32)
49359 return false;
49360
49361 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49362 unsigned SignBits[2] = {1, 1};
49363 bool IsPositive[2] = {false, false};
49364 for (unsigned i = 0; i < 2; i++) {
49365 SDValue Opd = N->getOperand(i);
49366
49367 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49368 IsPositive[i] = DAG.SignBitIsZero(Opd);
49369 }
49370
49371 bool AllPositive = IsPositive[0] && IsPositive[1];
49372 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49373 // When ranges are from -128 ~ 127, use MULS8 mode.
49374 if (MinSignBits >= 25)
49375 Mode = ShrinkMode::MULS8;
49376 // When ranges are from 0 ~ 255, use MULU8 mode.
49377 else if (AllPositive && MinSignBits >= 24)
49378 Mode = ShrinkMode::MULU8;
49379 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49380 else if (MinSignBits >= 17)
49381 Mode = ShrinkMode::MULS16;
49382 // When ranges are from 0 ~ 65535, use MULU16 mode.
49383 else if (AllPositive && MinSignBits >= 16)
49384 Mode = ShrinkMode::MULU16;
49385 else
49386 return false;
49387 return true;
49388}
49389
49390/// When the operands of vector mul are extended from smaller size values,
49391 /// like i8 and i16, the type of the mul may be shrunk to generate more
49392/// efficient code. Two typical patterns are handled:
49393/// Pattern1:
49394/// %2 = sext/zext <N x i8> %1 to <N x i32>
49395/// %4 = sext/zext <N x i8> %3 to <N x i32>
49396 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49397/// %5 = mul <N x i32> %2, %4
49398///
49399/// Pattern2:
49400/// %2 = zext/sext <N x i16> %1 to <N x i32>
49401/// %4 = zext/sext <N x i16> %3 to <N x i32>
49402/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49403/// %5 = mul <N x i32> %2, %4
49404///
49405/// There are four mul shrinking modes:
49406/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49407 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
49408/// generate pmullw+sext32 for it (MULS8 mode).
49409/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49410/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49411/// generate pmullw+zext32 for it (MULU8 mode).
49412/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49413/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49414/// generate pmullw+pmulhw for it (MULS16 mode).
49415/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49416/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49417/// generate pmullw+pmulhuw for it (MULU16 mode).
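/// Example (illustrative): in MULU16 mode on vXi32 operands that are really
/// zero-extended vXi16 values, pmullw provides the low 16 bits and pmulhuw the
/// high 16 bits of each product; the punpcklwd/punpckhwd shuffles below
/// reassemble (hi << 16) | lo into full i32 lanes.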
49418static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49419 const X86Subtarget &Subtarget) {
49420 // Check for legality
49421 // pmullw/pmulhw require SSE2; they are not available with SSE1 alone.
49422 if (!Subtarget.hasSSE2())
49423 return SDValue();
49424
49425 // Check for profitability
49426 // pmulld is supported since SSE41. It is better to use pmulld
49427 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49428 // the expansion.
49429 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49430 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49431 return SDValue();
49432
49433 ShrinkMode Mode;
49434 if (!canReduceVMulWidth(N, DAG, Mode))
49435 return SDValue();
49436
49437 SDValue N0 = N->getOperand(0);
49438 SDValue N1 = N->getOperand(1);
49439 EVT VT = N->getOperand(0).getValueType();
49440 unsigned NumElts = VT.getVectorNumElements();
49441 if ((NumElts % 2) != 0)
49442 return SDValue();
49443
49444 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49445
49446 // Shrink the operands of mul.
49447 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49448 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49449
49450 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49451 // lower part is needed.
49452 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49453 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49454 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49455 : ISD::SIGN_EXTEND,
49456 DL, VT, MulLo);
49457
49458 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49459 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49460 // the higher part is also needed.
49461 SDValue MulHi =
49462 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49463 ReducedVT, NewN0, NewN1);
49464
49465 // Repack the lower part and higher part result of mul into a wider
49466 // result.
49467 // Generate shuffle functioning as punpcklwd.
49468 SmallVector<int, 16> ShuffleMask(NumElts);
49469 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49470 ShuffleMask[2 * i] = i;
49471 ShuffleMask[2 * i + 1] = i + NumElts;
49472 }
49473 SDValue ResLo =
49474 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49475 ResLo = DAG.getBitcast(ResVT, ResLo);
49476 // Generate shuffle functioning as punpckhwd.
49477 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49478 ShuffleMask[2 * i] = i + NumElts / 2;
49479 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49480 }
49481 SDValue ResHi =
49482 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49483 ResHi = DAG.getBitcast(ResVT, ResHi);
49484 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49485}
49486
49487static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49488 EVT VT, const SDLoc &DL) {
49489
49490 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49491 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49492 DAG.getConstant(Mult, DL, VT));
49493 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49494 DAG.getConstant(Shift, DL, MVT::i8));
49495 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49496 N->getOperand(0));
49497 return Result;
49498 };
49499
49500 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49501 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49502 DAG.getConstant(Mul1, DL, VT));
49503 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49504 DAG.getConstant(Mul2, DL, VT));
49505 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49506 N->getOperand(0));
49507 return Result;
49508 };
49509
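// A few of the decompositions below, spelled out (illustrative only):
// 11*x = (5*x << 1) + x, 23*x = (3*x << 3) - x, 29*x = ((9*x)*3) + x + x.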
49510 switch (MulAmt) {
49511 default:
49512 break;
49513 case 11:
49514 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49515 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49516 case 21:
49517 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49518 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49519 case 41:
49520 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49521 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49522 case 22:
49523 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49524 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49525 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49526 case 19:
49527 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49528 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49529 case 37:
49530 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49531 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49532 case 73:
49533 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49534 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49535 case 13:
49536 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49537 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49538 case 23:
49539 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49540 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49541 case 26:
49542 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49543 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49544 case 28:
49545 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49546 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49547 case 29:
49548 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49549 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49550 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49551 }
49552
49553 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
49554 // followed by a single LEA.
49555 // First check if this is a sum of two powers of 2 because that's easy. Then
49556 // count how many zeros are up to the first set bit.
49557 // TODO: We can do this even without LEA at a cost of two shifts and an add.
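// e.g. MulAmt = 20 (0b10100): clearing the lowest set bit leaves 16, a power
// of 2, so 20*x = (x << 4) + (x << 2), and the second, small shift can be
// folded into LEA scaled addressing (illustrative).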
49558 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49559 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49560 if (ScaleShift >= 1 && ScaleShift < 4) {
49561 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49562 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49563 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49564 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49565 DAG.getConstant(ScaleShift, DL, MVT::i8));
49566 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49567 }
49568 }
49569
49570 return SDValue();
49571}
49572
49573 // If the upper 17 bits of either operand are zero and the upper bits of the
49574 // other operand are all zero/sign bits, then we can use PMADDWD, which is
49575 // always at least as quick as PMULLD, except on KNL.
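// e.g. (mul (and X, splat(0x7FFF)), (and Y, splat(0x7FFF))): the upper 17 bits
// of every i32 lane are zero on both sides, so VPMADDWD computes
// lo16(X)*lo16(Y) + 0*0 per lane, which is exactly the i32 product
// (illustrative sketch).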
49576static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49577 SelectionDAG &DAG,
49578 const X86Subtarget &Subtarget) {
49579 if (!Subtarget.hasSSE2())
49580 return SDValue();
49581
49582 if (Subtarget.isPMADDWDSlow())
49583 return SDValue();
49584
49585 EVT VT = N->getValueType(0);
49586
49587 // Only support vXi32 vectors.
49588 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49589 return SDValue();
49590
49591 // Make sure the type is legal or can split/widen to a legal type.
49592 // With AVX512 but without BWI, we would need to split v32i16.
49593 unsigned NumElts = VT.getVectorNumElements();
49594 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49595 return SDValue();
49596
49597 // With AVX512 but without BWI, we would need to split v32i16.
49598 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49599 return SDValue();
49600
49601 SDValue N0 = N->getOperand(0);
49602 SDValue N1 = N->getOperand(1);
49603
49604 // If we are zero/sign extending two steps without SSE4.1, it's better to
49605 // reduce the vmul width instead.
49606 if (!Subtarget.hasSSE41() &&
49607 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49608 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49609 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49610 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49611 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49612 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49613 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49614 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49615 return SDValue();
49616
49617 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49618 // the vmul width instead.
49619 if (!Subtarget.hasSSE41() &&
49620 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49621 N0.getOperand(0).getValueSizeInBits() > 128) &&
49622 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49623 N1.getOperand(0).getValueSizeInBits() > 128))
49624 return SDValue();
49625
49626 // Sign bits must extend down to the lowest i16.
49627 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49628 DAG.ComputeMaxSignificantBits(N0) > 16)
49629 return SDValue();
49630
49631 // At least one of the elements must be zero in the upper 17 bits, or can be
49632 // safely made zero without altering the final result.
49633 auto GetZeroableOp = [&](SDValue Op) {
49634 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49635 if (DAG.MaskedValueIsZero(Op, Mask17))
49636 return Op;
49637 // Mask off upper 16-bits of sign-extended constants.
49638 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49639 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49640 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49641 SDValue Src = Op.getOperand(0);
49642 // Convert sext(vXi16) to zext(vXi16).
49643 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49644 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49645 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49646 // which will expand the extension.
49647 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49648 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49649 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49650 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49651 }
49652 }
49653 // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
49654 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49655 N->isOnlyUserOf(Op.getNode())) {
49656 SDValue Src = Op.getOperand(0);
49657 if (Src.getScalarValueSizeInBits() == 16)
49658 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49659 }
49660 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49661 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49662 N->isOnlyUserOf(Op.getNode())) {
49663 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49664 Op.getOperand(1));
49665 }
49666 return SDValue();
49667 };
49668 SDValue ZeroN0 = GetZeroableOp(N0);
49669 SDValue ZeroN1 = GetZeroableOp(N1);
49670 if (!ZeroN0 && !ZeroN1)
49671 return SDValue();
49672 N0 = ZeroN0 ? ZeroN0 : N0;
49673 N1 = ZeroN1 ? ZeroN1 : N1;
49674
49675 // Use SplitOpsAndApply to handle AVX splitting.
49676 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49677 ArrayRef<SDValue> Ops) {
49678 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49679 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49680 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49681 DAG.getBitcast(OpVT, Ops[0]),
49682 DAG.getBitcast(OpVT, Ops[1]));
49683 };
49684 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49685}
49686
49687static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49688 const X86Subtarget &Subtarget) {
49689 if (!Subtarget.hasSSE2())
49690 return SDValue();
49691
49692 EVT VT = N->getValueType(0);
49693
49694 // Only support vXi64 vectors.
49695 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49696 VT.getVectorNumElements() < 2 ||
49697 !isPowerOf2_32(VT.getVectorNumElements()))
49698 return SDValue();
49699
49700 SDValue N0 = N->getOperand(0);
49701 SDValue N1 = N->getOperand(1);
49702
49703 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49704 // 32-bits. We can lower with this if the sign bits stretch that far.
49705 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49706 DAG.ComputeNumSignBits(N1) > 32) {
49707 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49708 ArrayRef<SDValue> Ops) {
49709 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49710 };
49711 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49712 /*CheckBWI*/ false);
49713 }
49714
49715 // If the upper bits are zero we can use a single pmuludq.
49716 APInt Mask = APInt::getHighBitsSet(64, 32);
49717 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49718 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49719 ArrayRef<SDValue> Ops) {
49720 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49721 };
49722 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49723 /*CheckBWI*/ false);
49724 }
49725
49726 return SDValue();
49727}
49728
49729static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49730 TargetLowering::DAGCombinerInfo &DCI,
49731 const X86Subtarget &Subtarget) {
49732 EVT VT = N->getValueType(0);
49733 SDLoc DL(N);
49734
49735 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49736 return V;
49737
49738 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49739 return V;
49740
49741 if (DCI.isBeforeLegalize() && VT.isVector())
49742 return reduceVMULWidth(N, DL, DAG, Subtarget);
49743
49744 if (VT != MVT::i64 && VT != MVT::i32 &&
49745 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49746 return SDValue();
49747
49748 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49749 if (!Known1.isConstant())
49750 return SDValue();
49751
49752 const APInt &C = Known1.getConstant();
49753 if (C.isZero())
49754 return DAG.getConstant(0, DL, VT);
49755
49756 if (C.isAllOnes())
49757 return DAG.getNegative(N->getOperand(0), DL, VT);
49758
49759 if (isPowerOf2_64(C.getZExtValue()))
49760 return SDValue();
49761
49762 // Optimize a single multiply with constant into two operations in order to
49763 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49764 if (!MulConstantOptimization)
49765 return SDValue();
49766
49767 // An imul is usually smaller than the alternative sequence.
49768 if (DAG.getMachineFunction().getFunction().hasMinSize())
49769 return SDValue();
49770
49771 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49772 return SDValue();
49773
49774 int64_t SignMulAmt = C.getSExtValue();
49775 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49776 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49777
49778 SDValue NewMul = SDValue();
49779 if (VT == MVT::i64 || VT == MVT::i32) {
49780 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49781 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49782 DAG.getConstant(AbsMulAmt, DL, VT));
49783 if (SignMulAmt < 0)
49784 NewMul = DAG.getNegative(NewMul, DL, VT);
49785
49786 return NewMul;
49787 }
49788
49789 uint64_t MulAmt1 = 0;
49790 uint64_t MulAmt2 = 0;
49791 if ((AbsMulAmt % 9) == 0) {
49792 MulAmt1 = 9;
49793 MulAmt2 = AbsMulAmt / 9;
49794 } else if ((AbsMulAmt % 5) == 0) {
49795 MulAmt1 = 5;
49796 MulAmt2 = AbsMulAmt / 5;
49797 } else if ((AbsMulAmt % 3) == 0) {
49798 MulAmt1 = 3;
49799 MulAmt2 = AbsMulAmt / 3;
49800 }
49801
49802 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49803 if (MulAmt2 &&
49804 (isPowerOf2_64(MulAmt2) ||
49805 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49806
49807 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49808 N->user_begin()->getOpcode() == ISD::ADD))
49809 // If the second multiplier is a power of 2, issue it first. We want the multiply
49810 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49811 // use is an add. Only do this for positive multiply amounts since the
49812 // negate would prevent it from being used as an address mode anyway.
49813 std::swap(MulAmt1, MulAmt2);
49814
49815 if (isPowerOf2_64(MulAmt1))
49816 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49817 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49818 else
49819 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49820 DAG.getConstant(MulAmt1, DL, VT));
49821
49822 if (isPowerOf2_64(MulAmt2))
49823 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49824 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49825 else
49826 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49827 DAG.getConstant(MulAmt2, DL, VT));
49828
49829 // Negate the result.
49830 if (SignMulAmt < 0)
49831 NewMul = DAG.getNegative(NewMul, DL, VT);
49832 } else if (!Subtarget.slowLEA())
49833 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49834 }
49835 if (!NewMul) {
49836 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49837 if (isPowerOf2_64(AbsMulAmt - 1)) {
49838 // (mul x, 2^N + 1) => (add (shl x, N), x)
49839 NewMul = DAG.getNode(
49840 ISD::ADD, DL, VT, N->getOperand(0),
49841 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49842 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49843 if (SignMulAmt < 0)
49844 NewMul = DAG.getNegative(NewMul, DL, VT);
49845 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49846 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49847 NewMul =
49848 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49849 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49850 // To negate, reverse the operands of the subtract.
49851 if (SignMulAmt < 0)
49852 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49853 else
49854 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49855 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49856 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49857 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49858 NewMul =
49859 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49860 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49861 NewMul = DAG.getNode(
49862 ISD::ADD, DL, VT, NewMul,
49863 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49864 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49865 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49866 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49867 NewMul =
49868 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49869 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49870 NewMul = DAG.getNode(
49871 ISD::SUB, DL, VT, NewMul,
49872 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49873 } else if (SignMulAmt >= 0 && VT.isVector() &&
49874 Subtarget.fastImmVectorShift()) {
49875 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49876 uint64_t ShiftAmt1;
49877 std::optional<unsigned> Opc;
49878 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49879 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49880 Opc = ISD::ADD;
49881 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49882 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49883 Opc = ISD::SUB;
49884 }
49885
49886 if (Opc) {
49887 SDValue Shift1 =
49888 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49889 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49890 SDValue Shift2 =
49891 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49892 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49893 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49894 }
49895 }
49896 }
49897
49898 return NewMul;
49899}
49900
49901// Try to form a MULHU or MULHS node by looking for
49902// (srl (mul ext, ext), 16)
49903// TODO: This is X86 specific because we want to be able to handle wide types
49904// before type legalization. But we can only do it if the vector will be
49905// legalized via widening/splitting. Type legalization can't handle promotion
49906// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49907// combiner.
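// e.g. (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)), 16)
// becomes (zext (mulhu v8i16 X, Y) to v8i32), i.e. a single PMULHUW feeding
// the extension (illustrative).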
49908static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49909 const SDLoc &DL,
49910 const X86Subtarget &Subtarget) {
49911 using namespace SDPatternMatch;
49912 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49913 "SRL or SRA node is required here!");
49914
49915 if (!Subtarget.hasSSE2())
49916 return SDValue();
49917
49918 // Input type should be at least vXi32.
49919 EVT VT = N->getValueType(0);
49920 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49921 return SDValue();
49922
49923 // The operation must be a multiply shifted right by 16.
49924 SDValue LHS, RHS;
49925 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49926 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49927 return SDValue();
49928
49929 unsigned ExtOpc = LHS.getOpcode();
49930 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49931 RHS.getOpcode() != ExtOpc)
49932 return SDValue();
49933
49934 // Peek through the extends.
49935 LHS = LHS.getOperand(0);
49936 RHS = RHS.getOperand(0);
49937
49938 // Ensure the input types match.
49939 EVT MulVT = LHS.getValueType();
49940 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49941 return SDValue();
49942
49943 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49944 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49945
49946 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49947 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49948}
49949
49950static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49951 const X86Subtarget &Subtarget) {
49952 using namespace llvm::SDPatternMatch;
49953 SDValue N0 = N->getOperand(0);
49954 SDValue N1 = N->getOperand(1);
49955 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49956 EVT VT = N0.getValueType();
49957 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49958 SDLoc DL(N);
49959
49960 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
49961 // with out-of-bounds clamping.
49962 if (N0.getOpcode() == ISD::VSELECT &&
49963 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
49964 SDValue Cond = N0.getOperand(0);
49965 SDValue N00 = N0.getOperand(1);
49966 SDValue N01 = N0.getOperand(2);
49967 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
49968 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
49969 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49970 m_SpecificCondCode(ISD::SETULT)))) {
49971 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
49972 }
49973 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
49974 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
49975 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
49976 m_SpecificCondCode(ISD::SETUGE)))) {
49977 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
49978 }
49979 }
49980
49981 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49982 // since the result of setcc_c is all zero's or all ones.
49983 if (VT.isInteger() && !VT.isVector() &&
49984 N1C && N0.getOpcode() == ISD::AND &&
49985 N0.getOperand(1).getOpcode() == ISD::Constant) {
49986 SDValue N00 = N0.getOperand(0);
49987 APInt Mask = N0.getConstantOperandAPInt(1);
49988 Mask <<= N1C->getAPIntValue();
49989 bool MaskOK = false;
49990 // We can handle cases concerning bit-widening nodes containing setcc_c if
49991 // we carefully interrogate the mask to make sure we are semantics
49992 // preserving.
49993 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49994 // of the underlying setcc_c operation if the setcc_c was zero extended.
49995 // Consider the following example:
49996 // zext(setcc_c) -> i32 0x0000FFFF
49997 // c1 -> i32 0x0000FFFF
49998 // c2 -> i32 0x00000001
49999 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50000 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50001 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50002 MaskOK = true;
50003 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50004 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50005 MaskOK = true;
50006 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50007 N00.getOpcode() == ISD::ANY_EXTEND) &&
50008 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50009 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50010 }
50011 if (MaskOK && Mask != 0)
50012 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50013 }
50014
50015 return SDValue();
50016}
50017
50018static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50019 const X86Subtarget &Subtarget) {
50020 using namespace llvm::SDPatternMatch;
50021 SDValue N0 = N->getOperand(0);
50022 SDValue N1 = N->getOperand(1);
50023 EVT VT = N0.getValueType();
50024 unsigned Size = VT.getSizeInBits();
50025 SDLoc DL(N);
50026
50027 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50028 return V;
50029
50030 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50031 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50032 SDValue ShrAmtVal;
50033 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50034 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50035 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50036 }
50037
50038 // fold (SRA (SHL X, ShlConst), SraConst)
50039 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50040 // or (sext_in_reg X)
50041 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50042 // depending on relation between SraConst and ShlConst.
50043 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50044 // us to do the sext_in_reg from the corresponding bit.
50045
50046 // sexts in X86 are MOVs. The MOVs have the same code size
50047 // as the SHIFTs above (only a shift by 1 has smaller code size).
50048 // However the MOVs have 2 advantages to a SHIFT:
50049 // 1. MOVs can write to a register that differs from source
50050 // 2. MOVs accept memory operands
50051
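// e.g. with Size = 32: (sra (shl X, 24), 28) has Size - ShlConst == 8, so it
// becomes (sra (sext_in_reg X, i8), 4); if SraConst were also 24 it would
// simplify to just (sext_in_reg X, i8).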
50052 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50053 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50054 N0.getOperand(1).getOpcode() != ISD::Constant)
50055 return SDValue();
50056
50057 SDValue N00 = N0.getOperand(0);
50058 SDValue N01 = N0.getOperand(1);
50059 APInt ShlConst = N01->getAsAPIntVal();
50060 APInt SraConst = N1->getAsAPIntVal();
50061 EVT CVT = N1.getValueType();
50062
50063 if (CVT != N01.getValueType())
50064 return SDValue();
50065 if (SraConst.isNegative())
50066 return SDValue();
50067
50068 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50069 unsigned ShiftSize = SVT.getSizeInBits();
50070 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50071 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50072 continue;
50073 SDValue NN =
50074 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50075 if (SraConst.eq(ShlConst))
50076 return NN;
50077 if (SraConst.ult(ShlConst))
50078 return DAG.getNode(ISD::SHL, DL, VT, NN,
50079 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50080 return DAG.getNode(ISD::SRA, DL, VT, NN,
50081 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50082 }
50083 return SDValue();
50084}
50085
50086static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50087 TargetLowering::DAGCombinerInfo &DCI,
50088 const X86Subtarget &Subtarget) {
50089 using namespace llvm::SDPatternMatch;
50090 SDValue N0 = N->getOperand(0);
50091 SDValue N1 = N->getOperand(1);
50092 EVT VT = N0.getValueType();
50093 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50094 SDLoc DL(N);
50095
50096 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50097 return V;
50098
50099 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50100 // with out-of-bounds clamping.
50101 if (N0.getOpcode() == ISD::VSELECT &&
50102 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50103 SDValue Cond = N0.getOperand(0);
50104 SDValue N00 = N0.getOperand(1);
50105 SDValue N01 = N0.getOperand(2);
50106 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50107 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50108 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50109 m_SpecificCondCode(ISD::SETULT)))) {
50110 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50111 }
50112 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50113 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50114 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50115 m_SpecificCondCode(ISD::SETUGE)))) {
50116 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50117 }
50118 }
50119
50120 // Only do this on the last DAG combine as it can interfere with other
50121 // combines.
50122 if (!DCI.isAfterLegalizeDAG())
50123 return SDValue();
50124
50125 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50126 // TODO: This is a generic DAG combine that became an x86-only combine to
50127 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50128 // and-not ('andn').
50129 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50130 return SDValue();
50131
50132 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50133 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50134 if (!ShiftC || !AndC)
50135 return SDValue();
50136
50137 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50138 // transform should reduce code size. It may also enable secondary transforms
50139 // from improved known-bits analysis or instruction selection.
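// e.g. (srl (and X, 0x3FC0), 6) -> (and (srl X, 6), 0xFF): the new 0xFF mask
// fits in an 8-bit immediate, so the AND encodes more compactly (illustrative).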
50140 APInt MaskVal = AndC->getAPIntValue();
50141
50142 // If this can be matched by a zero extend, don't optimize.
50143 if (MaskVal.isMask()) {
50144 unsigned TO = MaskVal.countr_one();
50145 if (TO >= 8 && isPowerOf2_32(TO))
50146 return SDValue();
50147 }
50148
50149 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50150 unsigned OldMaskSize = MaskVal.getSignificantBits();
50151 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50152 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50153 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50154 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50155 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50156 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50157 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50158 }
50159 return SDValue();
50160}
50161
50162static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50163 const X86Subtarget &Subtarget) {
50164 unsigned Opcode = N->getOpcode();
50165 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50166
50167 SDLoc DL(N);
50168 EVT VT = N->getValueType(0);
50169 SDValue N0 = N->getOperand(0);
50170 SDValue N1 = N->getOperand(1);
50171 EVT SrcVT = N0.getValueType();
50172
50173 SDValue BC0 =
50174 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50175 SDValue BC1 =
50176 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50177
50178 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50179 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50180 // truncation trees that help us avoid lane crossing shuffles.
50181 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50182 // TODO: We don't handle vXf64 shuffles yet.
50183 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50184 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50185 SmallVector<SDValue> ShuffleOps;
50186 SmallVector<int> ShuffleMask, ScaledMask;
50187 SDValue Vec = peekThroughBitcasts(BCSrc);
50188 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50189 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50190 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50191 // shuffle to a v4X64 width - we can probably relax this in the future.
50192 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50193 ShuffleOps[0].getValueType().is256BitVector() &&
50194 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50195 SDValue Lo, Hi;
50196 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50197 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50198 Lo = DAG.getBitcast(SrcVT, Lo);
50199 Hi = DAG.getBitcast(SrcVT, Hi);
50200 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50201 Res = DAG.getBitcast(ShufVT, Res);
50202 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50203 return DAG.getBitcast(VT, Res);
50204 }
50205 }
50206 }
50207 }
50208
50209 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50210 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50211 // If either/both ops are a shuffle that can scale to v2x64,
50212 // then see if we can perform this as a v4x32 post shuffle.
50213 SmallVector<SDValue> Ops0, Ops1;
50214 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50215 bool IsShuf0 =
50216 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50217 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50218 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50219 bool IsShuf1 =
50220 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50221 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50222 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50223 if (IsShuf0 || IsShuf1) {
50224 if (!IsShuf0) {
50225 Ops0.assign({BC0});
50226 ScaledMask0.assign({0, 1});
50227 }
50228 if (!IsShuf1) {
50229 Ops1.assign({BC1});
50230 ScaledMask1.assign({0, 1});
50231 }
50232
50233 SDValue LHS, RHS;
50234 int PostShuffle[4] = {-1, -1, -1, -1};
50235 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50236 if (M < 0)
50237 return true;
50238 Idx = M % 2;
50239 SDValue Src = Ops[M / 2];
50240 if (!LHS || LHS == Src) {
50241 LHS = Src;
50242 return true;
50243 }
50244 if (!RHS || RHS == Src) {
50245 Idx += 2;
50246 RHS = Src;
50247 return true;
50248 }
50249 return false;
50250 };
50251 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50252 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50253 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50254 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50255 LHS = DAG.getBitcast(SrcVT, LHS);
50256 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50257 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50258 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50259 Res = DAG.getBitcast(ShufVT, Res);
50260 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50261 return DAG.getBitcast(VT, Res);
50262 }
50263 }
50264 }
50265
50266 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50267 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50268 SmallVector<int> Mask0, Mask1;
50269 SmallVector<SDValue> Ops0, Ops1;
50270 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50271 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50272 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50273 !Ops0.empty() && !Ops1.empty() &&
50274 all_of(Ops0,
50275 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50276 all_of(Ops1,
50277 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50278 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50279 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50280 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50281 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50282 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50283 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50284 if ((Op00 == Op11) && (Op01 == Op10)) {
50285 std::swap(Op10, Op11);
50286 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50287 }
50288 if ((Op00 == Op10) && (Op01 == Op11)) {
50289 const int Map[4] = {0, 2, 1, 3};
50290 SmallVector<int, 4> ShuffleMask(
50291 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50292 Map[ScaledMask1[1]]});
50293 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50294 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50295 DAG.getBitcast(SrcVT, Op01));
50296 Res = DAG.getBitcast(ShufVT, Res);
50297 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50298 return DAG.getBitcast(VT, Res);
50299 }
50300 }
50301 }
50302
50303 return SDValue();
50304}
50305
50306static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50307 TargetLowering::DAGCombinerInfo &DCI,
50308 const X86Subtarget &Subtarget) {
50309 unsigned Opcode = N->getOpcode();
50310 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50311 "Unexpected pack opcode");
50312
50313 EVT VT = N->getValueType(0);
50314 SDValue N0 = N->getOperand(0);
50315 SDValue N1 = N->getOperand(1);
50316 unsigned NumDstElts = VT.getVectorNumElements();
50317 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50318 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50319 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50320 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50321 "Unexpected PACKSS/PACKUS input type");
50322
50323 bool IsSigned = (X86ISD::PACKSS == Opcode);
50324
50325 // Constant Folding.
50326 APInt UndefElts0, UndefElts1;
50327 SmallVector<APInt, 32> EltBits0, EltBits1;
50328 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50329 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50330 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50331 /*AllowWholeUndefs*/ true,
50332 /*AllowPartialUndefs*/ true) &&
50333 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50334 /*AllowWholeUndefs*/ true,
50335 /*AllowPartialUndefs*/ true)) {
50336 unsigned NumLanes = VT.getSizeInBits() / 128;
50337 unsigned NumSrcElts = NumDstElts / 2;
50338 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50339 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50340
50341 APInt Undefs(NumDstElts, 0);
50342 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50343 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50344 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50345 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50346 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50347 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50348
50349 if (UndefElts[SrcIdx]) {
50350 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50351 continue;
50352 }
50353
50354 APInt &Val = EltBits[SrcIdx];
50355 if (IsSigned) {
50356 // PACKSS: Truncate signed value with signed saturation.
50357 // Source values less than dst minint are saturated to minint.
50358 // Source values greater than dst maxint are saturated to maxint.
50359 Val = Val.truncSSat(DstBitsPerElt);
50360 } else {
50361 // PACKUS: Truncate signed value with unsigned saturation.
50362 // Source values less than zero are saturated to zero.
50363 // Source values greater than dst maxuint are saturated to maxuint.
50364 // NOTE: This is different from APInt::truncUSat.
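// e.g. for PACKUSWB (i16 -> i8): 300 saturates to 255, -5 saturates to 0,
// and 100 is passed through unchanged.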
50365 if (Val.isIntN(DstBitsPerElt))
50366 Val = Val.trunc(DstBitsPerElt);
50367 else if (Val.isNegative())
50368 Val = APInt::getZero(DstBitsPerElt);
50369 else
50370 Val = APInt::getAllOnes(DstBitsPerElt);
50371 }
50372 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50373 }
50374 }
50375
50376 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50377 }
50378
50379 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50380 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50381 return V;
50382
50383 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50384 // Currently limit this to allsignbits cases only.
50385 if (IsSigned &&
50386 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50387 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50388 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50389 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50390 if (Not0 && Not1) {
50391 SDLoc DL(N);
50392 MVT SrcVT = N0.getSimpleValueType();
50393 SDValue Pack =
50394 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50395 DAG.getBitcast(SrcVT, Not1));
50396 return DAG.getNOT(DL, Pack, VT);
50397 }
50398 }
50399
50400 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50401 // truncate to create a larger truncate.
50402 if (Subtarget.hasAVX512() &&
50403 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50404 N0.getOperand(0).getValueType() == MVT::v8i32) {
50405 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50406 (!IsSigned &&
50407 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50408 if (Subtarget.hasVLX())
50409 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50410
50411 // Widen input to v16i32 so we can truncate that.
50412 SDLoc dl(N);
50413 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50414 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50415 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50416 }
50417 }
50418
50419 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50420 if (VT.is128BitVector()) {
50421 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50422 SDValue Src0, Src1;
50423 if (N0.getOpcode() == ExtOpc &&
50424 N0.getOperand(0).getValueType().is64BitVector() &&
50425 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50426 Src0 = N0.getOperand(0);
50427 }
50428 if (N1.getOpcode() == ExtOpc &&
50429 N1.getOperand(0).getValueType().is64BitVector() &&
50430 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50431 Src1 = N1.getOperand(0);
50432 }
50433 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50434 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50435 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50436 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50437 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50438 }
50439
50440 // Try again with pack(*_extend_vector_inreg, undef).
50441 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50442 : ISD::ZERO_EXTEND_VECTOR_INREG;
50443 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50444 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50445 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50446 DAG);
50447 }
50448
50449 // Attempt to combine as shuffle.
50450 SDValue Op(N, 0);
50451 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50452 return Res;
50453
50454 return SDValue();
50455}
50456
50457static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50458 TargetLowering::DAGCombinerInfo &DCI,
50459 const X86Subtarget &Subtarget) {
50460 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50461 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50462 "Unexpected horizontal add/sub opcode");
50463
50464 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50465 MVT VT = N->getSimpleValueType(0);
50466 SDValue LHS = N->getOperand(0);
50467 SDValue RHS = N->getOperand(1);
50468
50469 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50470 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50471 LHS.getOpcode() == RHS.getOpcode() &&
50472 LHS.getValueType() == RHS.getValueType() &&
50473 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50474 SDValue LHS0 = LHS.getOperand(0);
50475 SDValue LHS1 = LHS.getOperand(1);
50476 SDValue RHS0 = RHS.getOperand(0);
50477 SDValue RHS1 = RHS.getOperand(1);
50478 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50479 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50480 SDLoc DL(N);
50481 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50482 LHS0.isUndef() ? LHS1 : LHS0,
50483 RHS0.isUndef() ? RHS1 : RHS0);
50484 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50485 Res = DAG.getBitcast(ShufVT, Res);
50486 SDValue NewLHS =
50487 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50488 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50489 SDValue NewRHS =
50490 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50491 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50492 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50493 DAG.getBitcast(VT, NewRHS));
50494 }
50495 }
50496 }
50497
50498 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50499 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50500 return V;
50501
50502 return SDValue();
50503}
50504
50505static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50506 TargetLowering::DAGCombinerInfo &DCI,
50507 const X86Subtarget &Subtarget) {
50508 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50509 X86ISD::VSRL == N->getOpcode()) &&
50510 "Unexpected shift opcode");
50511 EVT VT = N->getValueType(0);
50512 SDValue N0 = N->getOperand(0);
50513 SDValue N1 = N->getOperand(1);
50514
50515 // Shift zero -> zero.
50516 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50517 return DAG.getConstant(0, SDLoc(N), VT);
50518
50519 // Detect constant shift amounts.
50520 APInt UndefElts;
50521 SmallVector<APInt, 32> EltBits;
50522 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50523 /*AllowWholeUndefs*/ true,
50524 /*AllowPartialUndefs*/ false)) {
50525 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50526 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50527 EltBits[0].getZExtValue(), DAG);
50528 }
50529
50530 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50531 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50532 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50533 return SDValue(N, 0);
50534
50535 return SDValue();
50536}
50537
50538static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50539 TargetLowering::DAGCombinerInfo &DCI,
50540 const X86Subtarget &Subtarget) {
50541 unsigned Opcode = N->getOpcode();
50542 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50543 X86ISD::VSRLI == Opcode) &&
50544 "Unexpected shift opcode");
50545 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50546 EVT VT = N->getValueType(0);
50547 SDValue N0 = N->getOperand(0);
50548 SDValue N1 = N->getOperand(1);
50549 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50550 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50551 "Unexpected value type");
50552 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50553
50554 // (shift undef, X) -> 0
50555 if (N0.isUndef())
50556 return DAG.getConstant(0, SDLoc(N), VT);
50557
50558 // Out of range logical bit shifts are guaranteed to be zero.
50559 // Out of range arithmetic bit shifts splat the sign bit.
50560 unsigned ShiftVal = N->getConstantOperandVal(1);
50561 if (ShiftVal >= NumBitsPerElt) {
50562 if (LogicalShift)
50563 return DAG.getConstant(0, SDLoc(N), VT);
50564 ShiftVal = NumBitsPerElt - 1;
50565 }
50566
50567 // (shift X, 0) -> X
50568 if (!ShiftVal)
50569 return N0;
50570
50571 // (shift 0, C) -> 0
50572 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50573 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50574 // result are all zeros, not undef.
50575 return DAG.getConstant(0, SDLoc(N), VT);
50576
50577 // (VSRAI -1, C) -> -1
50578 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50579 // N0 is all ones or undef. We guarantee that the bits shifted into the
50580 // result are all ones, not undef.
50581 return DAG.getAllOnesConstant(SDLoc(N), VT);
50582
50583 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50584 unsigned NewShiftVal = Amt0 + Amt1;
50585 if (NewShiftVal >= NumBitsPerElt) {
50586 // Out of range logical bit shifts are guaranteed to be zero.
50587 // Out of range arithmetic bit shifts splat the sign bit.
50588 if (LogicalShift)
50589 return DAG.getConstant(0, SDLoc(N), VT);
50590 NewShiftVal = NumBitsPerElt - 1;
50591 }
50592 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50593 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50594 };
50595
50596 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50597 if (Opcode == N0.getOpcode())
50598 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50599
50600 // (shl (add X, X), C) -> (shl X, (C + 1))
50601 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50602 N0.getOperand(0) == N0.getOperand(1))
50603 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50604
50605 // We can decode 'whole byte' logical bit shifts as shuffles.
50606 if (LogicalShift && (ShiftVal % 8) == 0) {
50607 SDValue Op(N, 0);
50608 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50609 return Res;
50610 }
50611
50612 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50613 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50614 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50615 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50616 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50617 N0.getOpcode() == X86ISD::PSHUFD &&
50618 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50619 N0->hasOneUse()) {
50620 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50621 if (BC.getOpcode() == X86ISD::VSHLI &&
50622 BC.getScalarValueSizeInBits() == 64 &&
50623 BC.getConstantOperandVal(1) == 63) {
50624 SDLoc DL(N);
50625 SDValue Src = BC.getOperand(0);
50626 Src = DAG.getBitcast(VT, Src);
50627 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50628 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50629 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50630 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50631 return Src;
50632 }
50633 }
50634
50635 auto TryConstantFold = [&](SDValue V) {
50636 APInt UndefElts;
50637 SmallVector<APInt, 32> EltBits;
50638 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50639 /*AllowWholeUndefs*/ true,
50640 /*AllowPartialUndefs*/ true))
50641 return SDValue();
50642 assert(EltBits.size() == VT.getVectorNumElements() &&
50643 "Unexpected shift value type");
50644 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50645 // created an undef input due to no input bits being demanded, but user
50646 // still expects 0 in other bits.
50647 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50648 APInt &Elt = EltBits[i];
50649 if (UndefElts[i])
50650 Elt = 0;
50651 else if (X86ISD::VSHLI == Opcode)
50652 Elt <<= ShiftVal;
50653 else if (X86ISD::VSRAI == Opcode)
50654 Elt.ashrInPlace(ShiftVal);
50655 else
50656 Elt.lshrInPlace(ShiftVal);
50657 }
50658 // Reset undef elements since they were zeroed above.
50659 UndefElts = 0;
50660 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50661 };
50662
50663 // Constant Folding.
50664 if (N->isOnlyUserOf(N0.getNode())) {
50665 if (SDValue C = TryConstantFold(N0))
50666 return C;
50667
50668 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50669 // Don't break NOT patterns.
50670 SDValue BC = peekThroughOneUseBitcasts(N0);
50671 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50672 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50673 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50674 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50675 SDLoc DL(N);
50676 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50677 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50678 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50679 }
50680 }
50681 }
50682
50683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50684 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50685 DCI))
50686 return SDValue(N, 0);
50687
50688 return SDValue();
50689}
50690
50691 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50692 TargetLowering::DAGCombinerInfo &DCI,
50693 const X86Subtarget &Subtarget) {
50694 EVT VT = N->getValueType(0);
50695 unsigned Opcode = N->getOpcode();
50696 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50697 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50698 Opcode == ISD::INSERT_VECTOR_ELT) &&
50699 "Unexpected vector insertion");
50700
50701 SDValue Vec = N->getOperand(0);
50702 SDValue Scl = N->getOperand(1);
50703 SDValue Idx = N->getOperand(2);
50704
50705 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50706 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50707 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50708
50709 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50710 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50712 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50713 APInt::getAllOnes(NumBitsPerElt), DCI))
50714 return SDValue(N, 0);
50715 }
50716
50717 // Attempt to combine insertion patterns to a shuffle.
50718 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50719 SDValue Op(N, 0);
50720 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50721 return Res;
50722 }
50723
50724 return SDValue();
50725}
50726
50727/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50728/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50729/// OR -> CMPNEQSS.
50730 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50731 TargetLowering::DAGCombinerInfo &DCI,
50732 const X86Subtarget &Subtarget) {
50733 unsigned opcode;
50734
50735 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50736 // we're requiring SSE2 for both.
50737 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50738 SDValue N0 = N->getOperand(0);
50739 SDValue N1 = N->getOperand(1);
50740 SDValue CMP0 = N0.getOperand(1);
50741 SDValue CMP1 = N1.getOperand(1);
50742 SDLoc DL(N);
50743
50744 // The SETCCs should both refer to the same CMP.
50745 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50746 return SDValue();
50747
50748 SDValue CMP00 = CMP0->getOperand(0);
50749 SDValue CMP01 = CMP0->getOperand(1);
50750 EVT VT = CMP00.getValueType();
50751
50752 if (VT == MVT::f32 || VT == MVT::f64 ||
50753 (VT == MVT::f16 && Subtarget.hasFP16())) {
50754 bool ExpectingFlags = false;
50755 // Check for any users that want flags:
50756 for (const SDNode *U : N->users()) {
50757 if (ExpectingFlags)
50758 break;
50759
50760 switch (U->getOpcode()) {
50761 default:
50762 case ISD::BR_CC:
50763 case ISD::BRCOND:
50764 case ISD::SELECT:
50765 ExpectingFlags = true;
50766 break;
50767 case ISD::CopyToReg:
50768 case ISD::SIGN_EXTEND:
50769 case ISD::ZERO_EXTEND:
50770 case ISD::ANY_EXTEND:
50771 break;
50772 }
50773 }
50774
50775 if (!ExpectingFlags) {
50776 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50777 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50778
50779 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50780 X86::CondCode tmp = cc0;
50781 cc0 = cc1;
50782 cc1 = tmp;
50783 }
50784
50785 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50786 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50787 // FIXME: need symbolic constants for these magic numbers.
50788 // See X86ATTInstPrinter.cpp:printSSECC().
50789 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50790 if (Subtarget.hasAVX512()) {
50791 SDValue FSetCC =
50792 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50793 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50794 // Need to fill with zeros to ensure the bitcast will produce zeroes
50795 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50796 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50797 DAG.getConstant(0, DL, MVT::v16i1),
50798 FSetCC, DAG.getVectorIdxConstant(0, DL));
50799 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50800 N->getSimpleValueType(0));
50801 }
50802 SDValue OnesOrZeroesF =
50803 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50804 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50805
50806 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50807 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50808
50809 if (is64BitFP && !Subtarget.is64Bit()) {
50810 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50811 // 64-bit integer, since that's not a legal type. Since
50812 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50813 // bits, but can do this little dance to extract the lowest 32 bits
50814 // and work with those going forward.
50815 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50816 MVT::v2f64, OnesOrZeroesF);
50817 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50818 OnesOrZeroesF =
50819 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50820 DAG.getVectorIdxConstant(0, DL));
50821 IntVT = MVT::i32;
50822 }
50823
50824 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50825 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50826 DAG.getConstant(1, DL, IntVT));
50827 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50828 ANDed);
50829 return OneBitOfTruth;
50830 }
50831 }
50832 }
50833 }
50834 return SDValue();
50835}
50836
50837/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
50838 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50839 SelectionDAG &DAG) {
50840 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50841
50842 MVT VT = N->getSimpleValueType(0);
50843 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50844 return SDValue();
50845
50846 SDValue X, Y;
50847 SDValue N0 = N->getOperand(0);
50848 SDValue N1 = N->getOperand(1);
50849
50850 if (SDValue Not = IsNOT(N0, DAG)) {
50851 X = Not;
50852 Y = N1;
50853 } else if (SDValue Not = IsNOT(N1, DAG)) {
50854 X = Not;
50855 Y = N0;
50856 } else
50857 return SDValue();
50858
50859 X = DAG.getBitcast(VT, X);
50860 Y = DAG.getBitcast(VT, Y);
50861 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50862}
50863
50864/// Try to fold:
50865/// and (vector_shuffle<Z,...,Z>
50866/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50867/// ->
50868/// andnp (vector_shuffle<Z,...,Z>
50869/// (insert_vector_elt undef, X, Z), undef), Y
50870 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50871 const X86Subtarget &Subtarget) {
50872 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50873
50874 EVT VT = N->getValueType(0);
50875 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
50876 // value and require extra moves.
50877 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50878 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50879 return SDValue();
50880
50881 auto GetNot = [&DAG](SDValue V) {
50882 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50883 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50884 // end-users are ISD::AND, including cases such as
50885 // (and(extract_vector_element(SVN), Y)).
50886 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50887 !SVN->getOperand(1).isUndef()) {
50888 return SDValue();
50889 }
50890 SDValue IVEN = SVN->getOperand(0);
50891 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50892 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50893 return SDValue();
50894 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50895 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50896 return SDValue();
50897 SDValue Src = IVEN.getOperand(1);
50898 if (SDValue Not = IsNOT(Src, DAG)) {
50899 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50900 SDValue NotIVEN =
50901 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50902 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50903 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50904 SVN->getOperand(1), SVN->getMask());
50905 }
50906 return SDValue();
50907 };
50908
50909 SDValue X, Y;
50910 SDValue N0 = N->getOperand(0);
50911 SDValue N1 = N->getOperand(1);
50912 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50913
50914 if (SDValue Not = GetNot(N0)) {
50915 X = Not;
50916 Y = N1;
50917 } else if (SDValue Not = GetNot(N1)) {
50918 X = Not;
50919 Y = N0;
50920 } else
50921 return SDValue();
50922
50923 X = DAG.getBitcast(VT, X);
50924 Y = DAG.getBitcast(VT, Y);
50925 SDLoc DL(N);
50926
50927 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50928 // AVX2.
50929 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50930 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50931 SDValue LoX, HiX;
50932 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50933 SDValue LoY, HiY;
50934 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50935 EVT SplitVT = LoX.getValueType();
50936 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50937 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50938 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50939 }
50940
50941 if (TLI.isTypeLegal(VT))
50942 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50943
50944 return SDValue();
50945}
50946
50947// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50948// logical operations, like in the example below.
50949// or (and (truncate x, truncate y)),
50950// (xor (truncate z, build_vector (constants)))
50951// Given a target type \p VT, we generate
50952// or (and x, y), (xor z, zext(build_vector (constants)))
50953// given x, y and z are of type \p VT. We can do so, if operands are either
50954// truncates from VT types, the second operand is a vector of constants, can
50955// be recursively promoted or is an existing extension we can extend further.
50956 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
50957 SelectionDAG &DAG,
50958 const X86Subtarget &Subtarget,
50959 unsigned Depth) {
50960 // Limit recursion to avoid excessive compile times.
50961 if (Depth >= SelectionDAG::MaxRecursionDepth)
50962 return SDValue();
50963
50964 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
50965 return SDValue();
50966
50967 SDValue N0 = N.getOperand(0);
50968 SDValue N1 = N.getOperand(1);
50969
50970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50971 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
50972 return SDValue();
50973
50974 if (SDValue NN0 =
50975 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
50976 N0 = NN0;
50977 else {
50978 // The left side has to be a 'trunc'.
50979 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
50980 N0.getOperand(0).getValueType() == VT;
50981 if (LHSTrunc)
50982 N0 = N0.getOperand(0);
50983 else
50984 return SDValue();
50985 }
50986
50987 if (SDValue NN1 =
50988 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
50989 N1 = NN1;
50990 else {
50991 // The right side has to be a 'trunc', a (foldable) constant or an
50992 // existing extension we can extend further.
50993 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
50994 N1.getOperand(0).getValueType() == VT;
50995 if (RHSTrunc)
50996 N1 = N1.getOperand(0);
50997 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
50998 Subtarget.hasInt256() && N1.hasOneUse())
50999 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51000 else if (SDValue Cst =
51001 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51002 N1 = Cst;
51003 else
51004 return SDValue();
51005 }
51006
51007 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51008}
51009
51010// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51011// register. In most cases we actually compare or select YMM-sized registers
51012// and mixing the two types creates horrible code. This method optimizes
51013// some of the transition sequences.
51014// Even with AVX-512 this is still useful for removing casts around logical
51015// operations on vXi1 mask types.
51016 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51017 SelectionDAG &DAG,
51018 const X86Subtarget &Subtarget) {
51019 EVT VT = N.getValueType();
51020 assert(VT.isVector() && "Expected vector type");
51021 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51022 N.getOpcode() == ISD::ZERO_EXTEND ||
51023 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51024
51025 SDValue Narrow = N.getOperand(0);
51026 EVT NarrowVT = Narrow.getValueType();
51027
51028 // Generate the wide operation.
51029 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51030 if (!Op)
51031 return SDValue();
51032 switch (N.getOpcode()) {
51033 default: llvm_unreachable("Unexpected opcode");
51034 case ISD::ANY_EXTEND:
51035 return Op;
51036 case ISD::ZERO_EXTEND:
51037 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51038 case ISD::SIGN_EXTEND:
51039 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51040 Op, DAG.getValueType(NarrowVT));
51041 }
51042}
51043
51044static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51045 unsigned FPOpcode;
51046 switch (Opcode) {
51047 // clang-format off
51048 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51049 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51050 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51051 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51052 // clang-format on
51053 }
51054 return FPOpcode;
51055}
51056
51057/// If both input operands of a logic op are being cast from floating-point
51058/// types or FP compares, try to convert this into a floating-point logic node
51059/// to avoid unnecessary moves from SSE to integer registers.
51060static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51061 SDValue N0, SDValue N1,
51062 SelectionDAG &DAG,
51063 TargetLowering::DAGCombinerInfo &DCI,
51064 const X86Subtarget &Subtarget) {
51065 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51066 "Unexpected bit opcode");
51067
51068 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51069 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51070 return SDValue();
51071
51072 SDValue N00 = N0.getOperand(0);
51073 SDValue N10 = N1.getOperand(0);
51074 EVT N00Type = N00.getValueType();
51075 EVT N10Type = N10.getValueType();
51076
51077 // Ensure that both types are the same and are legal scalar fp types.
51078 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51079 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51080 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51081 return SDValue();
51082
51083 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51084 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51085 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51086 return DAG.getBitcast(VT, FPLogic);
51087 }
51088
51089 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51090 !N1.hasOneUse())
51091 return SDValue();
51092
51093 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51094 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51095
51096 // The vector ISA for FP predicates is incomplete before AVX, so converting
51097 // COMIS* to CMPS* may not be a win before AVX.
51098 if (!Subtarget.hasAVX() &&
51099 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51100 return SDValue();
51101
51102 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51103 // and vector logic:
51104 // logic (setcc N00, N01), (setcc N10, N11) -->
51105 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51106 unsigned NumElts = 128 / N00Type.getSizeInBits();
51107 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51108 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51109 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51110 SDValue N01 = N0.getOperand(1);
51111 SDValue N11 = N1.getOperand(1);
51112 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51113 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51114 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51115 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51116 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51117 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51118 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51119 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51120}
51121
51122// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51123// to reduce XMM->GPR traffic.
51124static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51125 SDValue N1, SelectionDAG &DAG) {
51126 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51127 "Unexpected bit opcode");
51128
51129 // Both operands must be single use MOVMSK.
51130 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51131 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51132 return SDValue();
51133
51134 SDValue Vec0 = N0.getOperand(0);
51135 SDValue Vec1 = N1.getOperand(0);
51136 EVT VecVT0 = Vec0.getValueType();
51137 EVT VecVT1 = Vec1.getValueType();
51138
51139 // Both MOVMSK operands must be from vectors of the same size and same element
51140 // size, but it's OK for an fp/int diff.
51141 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51142 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51143 return SDValue();
51144
51145 unsigned VecOpc =
51146 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51147 SDValue Result =
51148 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51149 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51150}
51151
51152// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51153// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51154// handles in InstCombine.
51155static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51156 SDValue N0, SDValue N1,
51157 SelectionDAG &DAG) {
51158 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51159 "Unexpected bit opcode");
51160
51161 // Both operands must be single use.
51162 if (!N0.hasOneUse() || !N1.hasOneUse())
51163 return SDValue();
51164
51165 // Search for matching shifts.
51166 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51167 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51168
51169 unsigned BCOpc = BC0.getOpcode();
51170 EVT BCVT = BC0.getValueType();
51171 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51172 return SDValue();
51173
51174 switch (BCOpc) {
51175 case X86ISD::VSHLI:
51176 case X86ISD::VSRLI:
51177 case X86ISD::VSRAI: {
51178 if (BC0.getOperand(1) != BC1.getOperand(1))
51179 return SDValue();
51180 SDValue BitOp =
51181 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51182 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51183 return DAG.getBitcast(VT, Shift);
51184 }
51185 }
51186
51187 return SDValue();
51188}
51189
51190// Attempt to fold:
51191// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51192// TODO: Handle PACKUS handling.
51193static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51194 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51195 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51196 "Unexpected bit opcode");
51197
51198 // Both operands must be single use.
51199 if (!N0.hasOneUse() || !N1.hasOneUse())
51200 return SDValue();
51201
51202 // Search for matching packs.
51203 N0 = peekThroughOneUseBitcasts(N0);
51204 N1 = peekThroughOneUseBitcasts(N1);
51205
51206 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51207 return SDValue();
51208
51209 MVT DstVT = N0.getSimpleValueType();
51210 if (DstVT != N1.getSimpleValueType())
51211 return SDValue();
51212
51213 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51214 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51215
51216 // Limit to allsignbits packing.
51217 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51218 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51219 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51220 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51221 return SDValue();
51222
51223 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51224 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51225 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51226}
51227
51228/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51229/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51230/// with a shift-right to eliminate loading the vector constant mask value.
51231 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51232 SelectionDAG &DAG,
51233 const X86Subtarget &Subtarget) {
51234 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51235 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51236 EVT VT = Op0.getValueType();
51237 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51238 return SDValue();
51239
51240 // Try to convert an "is positive" signbit masking operation into arithmetic
51241 // shift and "andn". This saves a materialization of a -1 vector constant.
51242 // The "is negative" variant should be handled more generally because it only
51243 // requires "and" rather than "andn":
51244 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51245 //
51246 // This is limited to the original type to avoid producing even more bitcasts.
51247 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51248 // will be profitable.
51249 if (N->getValueType(0) == VT &&
51250 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51251 SDValue X, Y;
51252 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51253 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51254 X = Op1.getOperand(0);
51255 Y = Op0;
51256 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51257 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51258 X = Op0.getOperand(0);
51259 Y = Op1;
51260 }
51261 if (X && Y) {
51262 SDValue Sra =
51263 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51264 VT.getScalarSizeInBits() - 1, DAG);
51265 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51266 }
51267 }
51268
51269 APInt SplatVal;
51270 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51271 return SDValue();
51272
51273 // Don't prevent creation of ANDN.
51274 if (isBitwiseNot(Op0))
51275 return SDValue();
51276
51277 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51278 return SDValue();
51279
51280 unsigned EltBitWidth = VT.getScalarSizeInBits();
51281 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51282 return SDValue();
51283
51284 unsigned ShiftVal = SplatVal.countr_one();
51285 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51286 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51287 return DAG.getBitcast(N->getValueType(0), Shift);
51288}
51289
51290// Get the index node from the lowered DAG of a GEP IR instruction with one
51291// indexing dimension.
51292 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51293 if (Ld->isIndexed())
51294 return SDValue();
51295
51296 SDValue Base = Ld->getBasePtr();
51297 if (Base.getOpcode() != ISD::ADD)
51298 return SDValue();
51299
51300 SDValue ShiftedIndex = Base.getOperand(0);
51301 if (ShiftedIndex.getOpcode() != ISD::SHL)
51302 return SDValue();
51303
51304 return ShiftedIndex.getOperand(0);
51305}
51306
51307static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51308 return Subtarget.hasBMI2() &&
51309 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51310}
51311
51312/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51313/// This undoes the inverse fold performed in InstCombine
51314 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51315 SelectionDAG &DAG) {
51316 using namespace llvm::SDPatternMatch;
51317 MVT VT = N->getSimpleValueType(0);
51318 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51319 return SDValue();
51320
51321 SDValue X, Y, Z;
51322 if (sd_match(N, m_And(m_Value(X),
51323 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51324 // Don't fold if Y or Z are constants to prevent infinite loops.
51325 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51326 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51327 return DAG.getNode(
51328 ISD::AND, DL, VT, X,
51329 DAG.getNOT(
51330 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51331 }
51332
51333 return SDValue();
51334}
51335
51336// This function recognizes cases where X86 bzhi instruction can replace and
51337// 'and-load' sequence.
51338// In case of loading integer value from an array of constants which is defined
51339// as follows:
51340//
51341// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51342//
51343// then applying a bitwise and on the result with another input.
51344// It's equivalent to performing bzhi (zero high bits) on the input, with the
51345// same index of the load.
51346 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51347 const X86Subtarget &Subtarget) {
51348 MVT VT = Node->getSimpleValueType(0);
51349 SDLoc dl(Node);
51350
51351 // Check if subtarget has BZHI instruction for the node's type
51352 if (!hasBZHI(Subtarget, VT))
51353 return SDValue();
51354
51355 // Try matching the pattern for both operands.
51356 for (unsigned i = 0; i < 2; i++) {
51357 // continue if the operand is not a load instruction
51358 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51359 if (!Ld)
51360 continue;
51361 const Value *MemOp = Ld->getMemOperand()->getValue();
51362 if (!MemOp)
51363 continue;
51364 // Get the Node which indexes into the array.
51365 SDValue Index = getIndexFromUnindexedLoad(Ld);
51366 if (!Index)
51367 continue;
51368
51369 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51370 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51371 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51372 Constant *Init = GV->getInitializer();
51373 Type *Ty = Init->getType();
51374 if (!Ty->isArrayTy() ||
51375 !Ty->getArrayElementType()->isIntegerTy() ||
51376 Ty->getArrayElementType()->getScalarSizeInBits() !=
51377 VT.getSizeInBits() ||
51378 Ty->getArrayNumElements() >
51379 Ty->getArrayElementType()->getScalarSizeInBits())
51380 continue;
51381
51382 // Check if the array's constant elements are suitable to our case.
51383 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51384 bool ConstantsMatch = true;
51385 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51386 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51387 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51388 ConstantsMatch = false;
51389 break;
51390 }
51391 }
51392 if (!ConstantsMatch)
51393 continue;
51394
51395 // Do the transformation (For 32-bit type):
51396 // -> (and (load arr[idx]), inp)
51397 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
51398 // that will be replaced with one bzhi instruction.
51399 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51400 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51401
51402 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51403 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51404 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51405
51406 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51407 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51408 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51409 }
51410 }
51411 }
51412 }
51413 return SDValue();
51414}
51415
51416// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51417// Where C is a mask containing the same number of bits as the setcc and
51418// where the setcc will freely 0 upper bits of k-register. We can replace the
51419// undef in the concat with 0s and remove the AND. This mainly helps with
51420// v2i1/v4i1 setcc being casted to scalar.
51421 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51422 const X86Subtarget &Subtarget) {
51423 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51424
51425 EVT VT = N->getValueType(0);
51426
51427 // Make sure this is an AND with constant. We will check the value of the
51428 // constant later.
51429 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51430 if (!C1)
51431 return SDValue();
51432
51433 // This is implied by the ConstantSDNode.
51434 assert(!VT.isVector() && "Expected scalar VT!");
51435
51436 SDValue Src = N->getOperand(0);
51437 if (!Src.hasOneUse())
51438 return SDValue();
51439
51440 // (Optionally) peek through any_extend().
51441 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51442 if (!Src.getOperand(0).hasOneUse())
51443 return SDValue();
51444 Src = Src.getOperand(0);
51445 }
51446
51447 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51448 return SDValue();
51449
51450 Src = Src.getOperand(0);
51451 EVT SrcVT = Src.getValueType();
51452
51453 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51454 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51455 !TLI.isTypeLegal(SrcVT))
51456 return SDValue();
51457
51458 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51459 return SDValue();
51460
51461 // We only care about the first subvector of the concat, we expect the
51462 // other subvectors to be ignored due to the AND if we make the change.
51463 SDValue SubVec = Src.getOperand(0);
51464 EVT SubVecVT = SubVec.getValueType();
51465
51466 // The RHS of the AND should be a mask with as many bits as SubVec.
51467 if (!TLI.isTypeLegal(SubVecVT) ||
51468 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51469 return SDValue();
51470
51471 // First subvector should be a setcc with a legal result type or a
51472 // AND containing at least one setcc with a legal result type.
51473 auto IsLegalSetCC = [&](SDValue V) {
51474 if (V.getOpcode() != ISD::SETCC)
51475 return false;
51476 EVT SetccVT = V.getOperand(0).getValueType();
51477 if (!TLI.isTypeLegal(SetccVT) ||
51478 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51479 return false;
51480 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51481 return false;
51482 return true;
51483 };
51484 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51485 (IsLegalSetCC(SubVec.getOperand(0)) ||
51486 IsLegalSetCC(SubVec.getOperand(1))))))
51487 return SDValue();
51488
51489 // We passed all the checks. Rebuild the concat_vectors with zeroes
51490 // and cast it back to VT.
51491 SDLoc dl(N);
51492 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51493 DAG.getConstant(0, dl, SubVecVT));
51494 Ops[0] = SubVec;
51495 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51496 Ops);
51497 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51498 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51499}
51500
51501 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51502 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51503 // We don't want to go crazy with the recursion here. This isn't a super
51504 // important optimization.
51505 static constexpr unsigned kMaxDepth = 2;
51506
51507 // Only do this re-ordering if op has one use.
51508 if (!Op.hasOneUse())
51509 return SDValue();
51510
51511 SDLoc DL(Op);
51512 // If we hit another associative op, recurse further.
51513 if (Op.getOpcode() == Opc) {
51514 // Done recursing.
51515 if (Depth++ >= kMaxDepth)
51516 return SDValue();
51517
51518 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51519 if (SDValue R =
51520 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51521 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51522 Op.getOperand(1 - OpIdx));
51523
51524 } else if (Op.getOpcode() == ISD::SUB) {
51525 if (Opc == ISD::AND) {
51526 // BLSI: (and x, (sub 0, x))
51527 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51528 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51529 }
51530 // Opc must be ISD::AND or ISD::XOR
51531 // BLSR: (and x, (sub x, 1))
51532 // BLSMSK: (xor x, (sub x, 1))
51533 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51534 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51535
51536 } else if (Op.getOpcode() == ISD::ADD) {
51537 // Opc must be ISD::AND or ISD::XOR
51538 // BLSR: (and x, (add x, -1))
51539 // BLSMSK: (xor x, (add x, -1))
51540 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51541 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51542 }
51543 return SDValue();
51544}
51545
51546 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51547 const X86Subtarget &Subtarget) {
51548 EVT VT = N->getValueType(0);
51549 // Make sure this node is a candidate for BMI instructions.
51550 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51551 (VT != MVT::i32 && VT != MVT::i64))
51552 return SDValue();
51553
51554 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51555
51556 // Try and match LHS and RHS.
51557 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51558 if (SDValue OpMatch =
51559 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51560 N->getOperand(1 - OpIdx), 0))
51561 return OpMatch;
51562 return SDValue();
51563}
51564
51565/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
51566 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51567 SelectionDAG &DAG,
51568 const X86Subtarget &Subtarget) {
51569 using namespace llvm::SDPatternMatch;
51570
51571 EVT VT = And->getValueType(0);
51572 // Make sure this node is a candidate for BMI instructions.
51573 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51574 return SDValue();
51575
51576 SDValue X;
51577 SDValue Y;
51578 if (!sd_match(And,
51579 m_And(m_OneUse(m_Xor(m_Value(X), m_Neg(m_Deferred(X)))),
51580 m_Value(Y))))
51581 return SDValue();
51582
51583 SDValue BLSMSK =
51584 DAG.getNode(ISD::XOR, DL, VT, X,
51585 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51586 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51587 return AndN;
51588}
51589
51590 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51591 SelectionDAG &DAG,
51592 TargetLowering::DAGCombinerInfo &DCI,
51593 const X86Subtarget &ST) {
51594 // cmp(setcc(cc, X), 0)
51595 // brcond ne
51596 // ->
51597 // X
51598 // brcond cc
51599
51600 // sub(setcc(cc, X), 1)
51601 // brcond ne
51602 // ->
51603 // X
51604 // brcond ~cc
51605 //
51606 // if only flag has users
51607
51608 SDValue SetCC = N->getOperand(0);
51609
51610 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51611 return SDValue();
51612
51613 // Check the only user of flag is `brcond ne`.
51614 SDNode *BrCond = *Flag->user_begin();
51615 if (BrCond->getOpcode() != X86ISD::BRCOND)
51616 return SDValue();
51617 unsigned CondNo = 2;
51618 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51619 X86::COND_NE)
51620 return SDValue();
51621
51622 SDValue X = SetCC.getOperand(1);
51623 // sub has two results while X only has one. DAG combine assumes the value
51624 // type matches.
51625 if (N->getOpcode() == X86ISD::SUB)
51626 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51627
51628 SDValue CCN = SetCC.getOperand(0);
51629 X86::CondCode CC =
51630 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51631 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51632 // Update CC for the consumer of the flag.
51633 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51634 // checking if the second condition evaluates to true. When comparing the
51635 // result with 1, we are checking if the second condition evaluates to false.
51636 SmallVector<SDValue> Ops(BrCond->op_begin(), BrCond->op_end());
51637 if (isNullConstant(N->getOperand(1)))
51638 Ops[CondNo] = CCN;
51639 else if (isOneConstant(N->getOperand(1)))
51640 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51641 else
51642 llvm_unreachable("expect constant 0 or 1");
51643
51644 SDValue NewBrCond =
51645 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51646 // Avoid self-assign error b/c CC1 can be `e/ne`.
51647 if (BrCond != NewBrCond.getNode())
51648 DCI.CombineTo(BrCond, NewBrCond);
51649 return X;
51650}
51651
51652 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51653 TargetLowering::DAGCombinerInfo &DCI,
51654 const X86Subtarget &ST) {
51655 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51656 // ->
51657 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51658
51659 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51660 // ->
51661 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51662 //
51663 // where cflags is determined by cc1.
51664
51665 if (!ST.hasCCMP())
51666 return SDValue();
51667
51668 SDValue SetCC0 = N->getOperand(0);
51669 SDValue SetCC1 = N->getOperand(1);
51670 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51671 SetCC1.getOpcode() != X86ISD::SETCC)
51672 return SDValue();
51673
51674 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51675 SDValue Op = V.getOperand(1);
51676 unsigned Opc = Op.getOpcode();
51677 if (Opc == X86ISD::SUB)
51678 return X86ISD::CCMP;
51679 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51680 return X86ISD::CTEST;
51681 return 0U;
51682 };
51683
51684 unsigned NewOpc = 0;
51685
51686 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51687 // appear on the right.
51688 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51689 std::swap(SetCC0, SetCC1);
51690 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51691 return SDValue();
51692 }
51693
51694 X86::CondCode CC0 =
51695 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51696 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51697 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51698 return SDValue();
51699
51700 bool IsOR = N->getOpcode() == ISD::OR;
51701
51702 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51703 // evaluates to true. So we need to invert CC0 as SrcCC when the logic
51704 // operator is OR. Similar for CC1.
51705 SDValue SrcCC =
51706 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51707 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51708 : SetCC0.getOperand(0);
51709 SDValue CC1N = SetCC1.getOperand(0);
51710 X86::CondCode CC1 =
51711 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51712 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51713 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51714 SDLoc DL(N);
51715 SDValue CFlags = DAG.getTargetConstant(
51716 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51717 SDValue Sub = SetCC1.getOperand(1);
51718
51719 // Replace any uses of the old flag produced by SUB/CMP with the new one
51720 // produced by CCMP/CTEST.
51721 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51722 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51723 {Sub.getOperand(0), Sub.getOperand(1),
51724 CFlags, SrcCC, SetCC0.getOperand(1)})
51725 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51726 {Sub.getOperand(0), Sub.getOperand(0),
51727 CFlags, SrcCC, SetCC0.getOperand(1)});
51728
51729 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51730}
51731
51732 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51733 TargetLowering::DAGCombinerInfo &DCI,
51734 const X86Subtarget &Subtarget) {
51735 using namespace SDPatternMatch;
51736
51737 SDValue N0 = N->getOperand(0);
51738 SDValue N1 = N->getOperand(1);
51739 EVT VT = N->getValueType(0);
51740 SDLoc dl(N);
51741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51742
51743 // If this is SSE1 only convert to FAND to avoid scalarization.
51744 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51745 return DAG.getBitcast(MVT::v4i32,
51746 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51747 DAG.getBitcast(MVT::v4f32, N0),
51748 DAG.getBitcast(MVT::v4f32, N1)));
51749 }
51750
51751 // Use a 32-bit and+zext if upper bits known zero.
51752 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51753 APInt HiMask = APInt::getHighBitsSet(64, 32);
51754 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51755 DAG.MaskedValueIsZero(N0, HiMask)) {
51756 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51757 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51758 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51759 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51760 }
51761 }
51762
51763 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51764 // TODO: Support multiple SrcOps.
51765 if (VT == MVT::i1) {
51766 SmallVector<SDValue, 2> SrcOps;
51767 SmallVector<APInt, 2> SrcPartials;
51768 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51769 SrcOps.size() == 1) {
51770 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51771 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51772 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51773 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51774 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51775 if (Mask) {
51776 assert(SrcPartials[0].getBitWidth() == NumElts &&
51777 "Unexpected partial reduction mask");
51778 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51779 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51780 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51781 }
51782 }
51783 }
51784
51785 // InstCombine converts:
51786 // `(-x << C0) & C1`
51787 // to
51788 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51789 // This saves an IR instruction but on x86 the neg/shift version is preferable
51790 // so undo the transform.
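// e.g. with C0 == 2 and C1 == 60, (x * 60) & 60 is turned back into
// ((0 - x) << 2) & 60.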
51791
51792 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51793 // TODO: We don't actually need a splat for this, we just need the checks to
51794 // hold for each element.
51795 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51796 /*AllowTruncation*/ false);
51797 ConstantSDNode *N01C =
51798 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51799 /*AllowTruncation*/ false);
51800 if (N1C && N01C) {
51801 const APInt &MulC = N01C->getAPIntValue();
51802 const APInt &AndC = N1C->getAPIntValue();
51803 APInt MulCLowBit = MulC & (-MulC);
51804 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51805 (MulCLowBit + MulC).isPowerOf2()) {
51806 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51807 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51808 assert(MulCLowBitLog != -1 &&
51809 "Isolated lowbit is somehow not a power of 2!");
51810 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51811 DAG.getConstant(MulCLowBitLog, dl, VT));
51812 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51813 }
51814 }
51815 }
51816
51817 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51818 return SetCC;
51819
51820 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51821 return V;
51822
51823 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51824 return R;
51825
51826 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51827 return R;
51828
51829 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51830 return R;
51831
51832 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51833 DAG, DCI, Subtarget))
51834 return FPLogic;
51835
51836 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51837 return R;
51838
51839 if (DCI.isBeforeLegalizeOps())
51840 return SDValue();
51841
51842 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51843 return R;
51844
51845 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51846 return R;
51847
51848 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51849 return ShiftRight;
51850
51851 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51852 return R;
51853
51854 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51855 return R;
51856
51857 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51858 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51859 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51860 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51861 unsigned Opc0 = N0.getOpcode();
51862 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51863 getTargetConstantFromNode(N0.getOperand(1)) &&
51864 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51865 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51866 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51867 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51868 }
51869 }
51870
51871 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
51872 // to make use of predicated selects.
51873 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51874 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51875 SDValue X, Y;
51876 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51877 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51878 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51879 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51880 sd_match(N, m_And(m_Value(X),
51881 m_OneUse(m_SExt(m_AllOf(
51882 m_Value(Y), m_SpecificVT(CondVT),
51883 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51884 return DAG.getSelect(dl, VT, Y, X,
51885 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51886 }
51887 }
51888
51889 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51890 // as this avoids a slow variable shift (moving the shift amount to ECX etc.)
51891 if (isOneConstant(N1) && N0->hasOneUse()) {
51892 SDValue Src = N0;
51893 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51894 Src.getOpcode() == ISD::TRUNCATE) &&
51895 Src.getOperand(0)->hasOneUse())
51896 Src = Src.getOperand(0);
51897 bool ContainsNOT = false;
51898 X86::CondCode X86CC = X86::COND_B;
51899 // Peek through AND(NOT(SRL(X,Y)),1).
51900 if (isBitwiseNot(Src)) {
51901 Src = Src.getOperand(0);
51902 X86CC = X86::COND_AE;
51903 ContainsNOT = true;
51904 }
51905 if (Src.getOpcode() == ISD::SRL &&
51906 !isa<ConstantSDNode>(Src.getOperand(1))) {
51907 SDValue BitNo = Src.getOperand(1);
51908 Src = Src.getOperand(0);
51909 // Peek through AND(SRL(NOT(X),Y),1).
51910 if (isBitwiseNot(Src)) {
51911 Src = Src.getOperand(0);
51912 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51913 ContainsNOT = true;
51914 }
51915 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51916 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51917 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51918 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51919 }
51920 }
51921
51922 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51923 // Attempt to recursively combine a bitmask AND with shuffles.
51924 SDValue Op(N, 0);
51925 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51926 return Res;
51927
51928 // If either operand is a constant mask, then only the elements that aren't
51929 // zero are actually demanded by the other operand.
51930 auto GetDemandedMasks = [&](SDValue Op) {
51931 APInt UndefElts;
51932 SmallVector<APInt> EltBits;
51933 int NumElts = VT.getVectorNumElements();
51934 int EltSizeInBits = VT.getScalarSizeInBits();
51935 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51936 APInt DemandedElts = APInt::getAllOnes(NumElts);
51937 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51938 EltBits)) {
51939 DemandedBits.clearAllBits();
51940 DemandedElts.clearAllBits();
51941 for (int I = 0; I != NumElts; ++I) {
51942 if (UndefElts[I]) {
51943 // We can't assume an undef src element gives an undef dst - the
51944 // other src might be zero.
51945 DemandedBits.setAllBits();
51946 DemandedElts.setBit(I);
51947 } else if (!EltBits[I].isZero()) {
51948 DemandedBits |= EltBits[I];
51949 DemandedElts.setBit(I);
51950 }
51951 }
51952 }
51953 return std::make_pair(DemandedBits, DemandedElts);
51954 };
51955 APInt Bits0, Elts0;
51956 APInt Bits1, Elts1;
51957 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
51958 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
51959
51960 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
51961 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
51962 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
51963 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
51964 if (N->getOpcode() != ISD::DELETED_NODE)
51965 DCI.AddToWorklist(N);
51966 return SDValue(N, 0);
51967 }
51968
51969 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
51970 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
51971 if (NewN0 || NewN1)
51972 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
51973 NewN1 ? NewN1 : N1);
51974 }
51975
51976 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
51977 if ((VT.getScalarSizeInBits() % 8) == 0 &&
51979 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
51980 SDValue BitMask = N1;
51981 SDValue SrcVec = N0.getOperand(0);
51982 EVT SrcVecVT = SrcVec.getValueType();
51983
51984 // Check that the constant bitmask masks whole bytes.
51985 APInt UndefElts;
51986 SmallVector<APInt, 64> EltBits;
51987 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
51988 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
51989 llvm::all_of(EltBits, [](const APInt &M) {
51990 return M.isZero() || M.isAllOnes();
51991 })) {
51992 unsigned NumElts = SrcVecVT.getVectorNumElements();
51993 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
51994 unsigned Idx = N0.getConstantOperandVal(1);
51995
51996 // Create a root shuffle mask from the byte mask and the extracted index.
51997 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
51998 for (unsigned i = 0; i != Scale; ++i) {
51999 if (UndefElts[i])
52000 continue;
52001 int VecIdx = Scale * Idx + i;
52002 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52003 }
52004
52005 if (SDValue Shuffle = combineX86ShufflesRecursively(
52006 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52007 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52008 /*AllowVariableCrossLaneMask=*/true,
52009 /*AllowVariablePerLaneMask=*/true,
52010 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52011 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52012 N0.getOperand(1));
52013 }
52014 }
52015
52016 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52017 return R;
52018
52019 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52020 return R;
52021
52022 return SDValue();
52023}
52024
52025// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
52026 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52027 SelectionDAG &DAG,
52028 const X86Subtarget &Subtarget) {
52029 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52030
52031 MVT VT = N->getSimpleValueType(0);
52032 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52033 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52034 return SDValue();
52035
52036 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52037 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52038 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52039 return SDValue();
52040
52041 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52042 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52043 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52044 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52045 return SDValue();
52046
52047 // Attempt to extract constant byte masks.
52048 APInt UndefElts0, UndefElts1;
52049 SmallVector<APInt, 32> EltBits0, EltBits1;
52050 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52051 /*AllowWholeUndefs*/ false,
52052 /*AllowPartialUndefs*/ false))
52053 return SDValue();
52054 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52055 /*AllowWholeUndefs*/ false,
52056 /*AllowPartialUndefs*/ false))
52057 return SDValue();
52058
52059 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52060 // TODO - add UNDEF elts support.
52061 if (UndefElts0[i] || UndefElts1[i])
52062 return SDValue();
52063 if (EltBits0[i] != ~EltBits1[i])
52064 return SDValue();
52065 }
52066
52067 if (useVPTERNLOG(Subtarget, VT)) {
52068 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52069 // VPTERNLOG is only available as vXi32/64-bit types.
52070 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52071 MVT OpVT =
52072 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52073 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52074 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52075 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52076 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52077 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52078 DAG, Subtarget);
52079 return DAG.getBitcast(VT, Res);
52080 }
52081
52082 SDValue X = N->getOperand(0);
52083 SDValue Y =
52084 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52085 DAG.getBitcast(VT, N1.getOperand(0)));
52086 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52087}
52088
52089// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52090// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52091// Waiting for ANDNP combine allows other combines to happen that prevent
52092// matching.
52093static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52094 using namespace SDPatternMatch;
52095 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52096 m_And(m_Deferred(Mask), m_Value(Y))));
52097}
52098
52099// Try to fold:
52100// (or (and (m, y), (pandn m, x)))
52101// into:
52102// (vselect m, x, y)
52103// As a special case, try to fold:
52104// (or (and (m, (sub 0, x)), (pandn m, x)))
52105// into:
52106// (sub (xor X, M), M)
52107 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52108 SelectionDAG &DAG,
52109 const X86Subtarget &Subtarget) {
52110 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52111
52112 EVT VT = N->getValueType(0);
52113 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52114 (VT.is256BitVector() && Subtarget.hasInt256())))
52115 return SDValue();
52116
52117 SDValue X, Y, Mask;
52118 if (!matchLogicBlend(N, X, Y, Mask))
52119 return SDValue();
52120
52121 // Validate that X, Y, and Mask are bitcasts, and see through them.
52122 Mask = peekThroughBitcasts(Mask);
52123 X = peekThroughBitcasts(X);
52124 Y = peekThroughBitcasts(Y);
52125
52126 EVT MaskVT = Mask.getValueType();
52127 unsigned EltBits = MaskVT.getScalarSizeInBits();
52128
52129 // TODO: Attempt to handle floating point cases as well?
52130 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52131 return SDValue();
52132
52133 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52134 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52135 DAG, Subtarget))
52136 return Res;
52137
52138 // PBLENDVB is only available on SSE 4.1.
52139 if (!Subtarget.hasSSE41())
52140 return SDValue();
52141
52142 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52143 if (Subtarget.hasVLX())
52144 return SDValue();
52145
52146 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52147
52148 X = DAG.getBitcast(BlendVT, X);
52149 Y = DAG.getBitcast(BlendVT, Y);
52150 Mask = DAG.getBitcast(BlendVT, Mask);
52151 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52152 return DAG.getBitcast(VT, Mask);
52153}
52154
52155// Helper function for combineOrCmpEqZeroToCtlzSrl
52156// Transforms:
52157// seteq(cmp x, 0)
52158// into:
52159// srl(ctlz x), log2(bitsize(x))
52160// Input pattern is checked by caller.
52161 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52162 SDValue Cmp = Op.getOperand(1);
52163 EVT VT = Cmp.getOperand(0).getValueType();
52164 unsigned Log2b = Log2_32(VT.getSizeInBits());
52165 SDLoc dl(Op);
52166 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52167 // The result of the shift is true or false, and on X86, the 32-bit
52168 // encoding of shr and lzcnt is more desirable.
52169 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52170 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52171 DAG.getConstant(Log2b, dl, MVT::i8));
52172 return Scc;
52173}
52174
52175// Try to transform:
52176// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52177// into:
52178// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52179// Will also attempt to match more generic cases, eg:
52180// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52181// Only applies if the target supports the FastLZCNT feature.
52182static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52183                                           TargetLowering::DAGCombinerInfo &DCI,
52184                                           const X86Subtarget &Subtarget) {
52185 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52186 return SDValue();
52187
52188 auto isORCandidate = [](SDValue N) {
52189 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52190 };
52191
52192 // Check the zero extend is extending to 32-bit or more. The code generated by
52193 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52194 // instructions to clear the upper bits.
52195 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52196 !isORCandidate(N->getOperand(0)))
52197 return SDValue();
52198
52199 // Check the node matches: setcc(eq, cmp 0)
52200 auto isSetCCCandidate = [](SDValue N) {
52201 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52202 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52203 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52204 isNullConstant(N->getOperand(1).getOperand(1)) &&
52205 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52206 };
52207
52208 SDNode *OR = N->getOperand(0).getNode();
52209 SDValue LHS = OR->getOperand(0);
52210 SDValue RHS = OR->getOperand(1);
52211
52212 // Save nodes matching or(or, setcc(eq, cmp 0)).
52213  SmallVector<SDNode *, 2> ORNodes;
52214  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52215 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52216 ORNodes.push_back(OR);
52217 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52218 LHS = OR->getOperand(0);
52219 RHS = OR->getOperand(1);
52220 }
52221
52222 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52223 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52224 !isORCandidate(SDValue(OR, 0)))
52225 return SDValue();
52226
52227  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52228 // to
52229 // or(srl(ctlz),srl(ctlz)).
52230 // The dag combiner can then fold it into:
52231 // srl(or(ctlz, ctlz)).
52232 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52233 SDValue Ret, NewRHS;
52234 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52235 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52236
52237 if (!Ret)
52238 return SDValue();
52239
52240 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52241 while (!ORNodes.empty()) {
52242 OR = ORNodes.pop_back_val();
52243 LHS = OR->getOperand(0);
52244 RHS = OR->getOperand(1);
52245 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52246 if (RHS->getOpcode() == ISD::OR)
52247 std::swap(LHS, RHS);
52248 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52249 if (!NewRHS)
52250 return SDValue();
52251 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52252 }
52253
52254 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52255}
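
The reason a single shift suffices after OR-ing the counts is that 32 is the only possible ctlz result with bit 5 set. An illustrative standalone check for a pair of values (again assuming C++20; not part of this file):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Tests[] = {0u, 1u, 0x80000000u, 0xFFFFFFFFu, 42u};
  for (uint32_t X : Tests)
    for (uint32_t Y : Tests) {
      // The OR of the two counts has bit 5 set iff at least one input is 0.
      unsigned Merged =
          unsigned(std::countl_zero(X) | std::countl_zero(Y)) >> 5;
      assert(Merged == unsigned((X == 0) | (Y == 0)));
    }
  return 0;
}
```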
52256
52257/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52258/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52259/// with CMP+{ADC, SBB}.
52260/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
52261static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52262 SDValue X, SDValue Y,
52263 SelectionDAG &DAG,
52264 bool ZeroSecondOpOnly = false) {
52265 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52266 return SDValue();
52267
52268 // Look through a one-use zext.
52269 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52270 Y = Y.getOperand(0);
52271
52272 X86::CondCode CC;
52273 SDValue EFLAGS;
52274 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52275 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52276 EFLAGS = Y.getOperand(1);
52277 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52278 Y.hasOneUse()) {
52279 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52280 }
52281
52282 if (!EFLAGS)
52283 return SDValue();
52284
52285 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52286 // the general case below.
52287 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52288 if (ConstantX && !ZeroSecondOpOnly) {
52289 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52290 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52291 // This is a complicated way to get -1 or 0 from the carry flag:
52292 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52293 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52294 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52295 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52296 EFLAGS);
52297 }
52298
52299 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52300 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52301 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52302 EFLAGS.getValueType().isInteger() &&
52303 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52304 // Swap the operands of a SUB, and we have the same pattern as above.
52305 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52306 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52307 SDValue NewSub = DAG.getNode(
52308 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52309 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52310 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52311 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52312 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52313 NewEFLAGS);
52314 }
52315 }
52316 }
52317
52318 if (CC == X86::COND_B) {
52319 // X + SETB Z --> adc X, 0
52320 // X - SETB Z --> sbb X, 0
52321 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52322 DAG.getVTList(VT, MVT::i32), X,
52323 DAG.getConstant(0, DL, VT), EFLAGS);
52324 }
52325
52326 if (ZeroSecondOpOnly)
52327 return SDValue();
52328
52329 if (CC == X86::COND_A) {
52330 // Try to convert COND_A into COND_B in an attempt to facilitate
52331 // materializing "setb reg".
52332 //
52333 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
52334 // cannot take an immediate as its first operand.
52335 //
52336 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52337 EFLAGS.getValueType().isInteger() &&
52338 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52339 SDValue NewSub =
52340 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52341 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52342 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52343 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52344 DAG.getVTList(VT, MVT::i32), X,
52345 DAG.getConstant(0, DL, VT), NewEFLAGS);
52346 }
52347 }
52348
52349 if (CC == X86::COND_AE) {
52350 // X + SETAE --> sbb X, -1
52351 // X - SETAE --> adc X, -1
52352 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52353 DAG.getVTList(VT, MVT::i32), X,
52354 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52355 }
52356
52357 if (CC == X86::COND_BE) {
52358 // X + SETBE --> sbb X, -1
52359 // X - SETBE --> adc X, -1
52360 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52361 // materializing "setae reg".
52362 //
52363 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
52364 // cannot take an immediate as its first operand.
52365 //
52366 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52367 EFLAGS.getValueType().isInteger() &&
52368 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52369 SDValue NewSub =
52370 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52371 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52372 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52373 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52374 DAG.getVTList(VT, MVT::i32), X,
52375 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52376 }
52377 }
52378
52379 if (CC != X86::COND_E && CC != X86::COND_NE)
52380 return SDValue();
52381
52382 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52383 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52384 !EFLAGS.getOperand(0).getValueType().isInteger())
52385 return SDValue();
52386
52387 SDValue Z = EFLAGS.getOperand(0);
52388 EVT ZVT = Z.getValueType();
52389
52390 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52391 // the general case below.
52392 if (ConstantX) {
52393 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52394 // fake operands:
52395 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52396 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52397 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52398 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52399 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52400 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52401 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52402 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52403 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52404 SDValue(Neg.getNode(), 1));
52405 }
52406
52407 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52408 // with fake operands:
52409 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52410 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52411 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52412 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52413 SDValue One = DAG.getConstant(1, DL, ZVT);
52414 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52415 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52416 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52417 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52418 Cmp1.getValue(1));
52419 }
52420 }
52421
52422 // (cmp Z, 1) sets the carry flag if Z is 0.
52423 SDValue One = DAG.getConstant(1, DL, ZVT);
52424 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52425 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52426
52427 // Add the flags type for ADC/SBB nodes.
52428 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52429
52430 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52431 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52432 if (CC == X86::COND_NE)
52433 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52434 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52435
52436 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52437 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52438 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52439 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52440}
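
The final two folds rest on the carry-flag behaviour of (cmp Z, 1). A brute-force scalar sketch of those identities over 8-bit values (illustrative only; not part of this file):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int x = 0; x < 256; ++x)
    for (int z = 0; z < 256; ++z) {
      uint8_t X = uint8_t(x), Z = uint8_t(z);
      // (cmp Z, 1) sets the carry flag exactly when Z == 0 (borrow of Z - 1).
      unsigned CF = Z < 1 ? 1u : 0u;
      // sbb X, -1 computes X - (-1) - CF == X + 1 - CF, i.e. X + (Z != 0).
      assert(uint8_t(X - uint8_t(-1) - CF) == uint8_t(X + (Z != 0)));
      // adc X, 0 computes X + CF, i.e. X + (Z == 0).
      assert(uint8_t(X + CF) == uint8_t(X + (Z == 0)));
    }
  return 0;
}
```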
52441
52442/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52443/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52444/// with CMP+{ADC, SBB}.
52445static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52446                                         SelectionDAG &DAG) {
52447 bool IsSub = N->getOpcode() == ISD::SUB;
52448 SDValue X = N->getOperand(0);
52449 SDValue Y = N->getOperand(1);
52450 EVT VT = N->getValueType(0);
52451
52452 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52453 return ADCOrSBB;
52454
52455 // Commute and try again (negate the result for subtracts).
52456 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52457 if (IsSub)
52458 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52459 return ADCOrSBB;
52460 }
52461
52462 return SDValue();
52463}
52464
52465static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52466 SDValue N0, SDValue N1,
52467 SelectionDAG &DAG) {
52468 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52469
52470 // Delegate to combineAddOrSubToADCOrSBB if we have:
52471 //
52472 // (xor/or (zero_extend (setcc)) imm)
52473 //
52474 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52475 // equivalent to a SUB/ADD, respectively.
52476 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52477 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52478 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52479 bool IsSub = Opc == ISD::XOR;
52480 bool N1COdd = N1C->getZExtValue() & 1;
52481 if (IsSub ? N1COdd : !N1COdd)
52482 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52483 return R;
52484 }
52485 }
52486
52487 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52488 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52489 N0.getOperand(0).getOpcode() == ISD::AND &&
52490      ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52491      ISD::isBuildVectorAllOnes(N1.getNode()) &&
52492      isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52493                         VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52494 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52495 N0.getOperand(0).getOperand(1));
52496 }
52497
52498 return SDValue();
52499}
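
The PCMPEQ rewrite is sound because masking with a power of two can only produce zero or that power of two. A small illustrative check (not part of this file):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int x = 0; x < 256; ++x)
    for (int s = 0; s < 8; ++s) {
      uint8_t X = uint8_t(x), Pow2 = uint8_t(1u << s);
      // X & Pow2 is either 0 or Pow2, so "not equal to zero" and
      // "equal to Pow2" are the same predicate.
      assert((!((X & Pow2) == 0)) == ((X & Pow2) == Pow2));
    }
  return 0;
}
```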
52500
52501static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52502                         TargetLowering::DAGCombinerInfo &DCI,
52503                         const X86Subtarget &Subtarget) {
52504 SDValue N0 = N->getOperand(0);
52505 SDValue N1 = N->getOperand(1);
52506 EVT VT = N->getValueType(0);
52507 SDLoc dl(N);
52508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52509
52510 // If this is SSE1 only convert to FOR to avoid scalarization.
52511 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52512 return DAG.getBitcast(MVT::v4i32,
52513 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52514 DAG.getBitcast(MVT::v4f32, N0),
52515 DAG.getBitcast(MVT::v4f32, N1)));
52516 }
52517
52518 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52519 // TODO: Support multiple SrcOps.
52520 if (VT == MVT::i1) {
52521    SmallVector<SDValue, 2> SrcOps;
52522    SmallVector<APInt, 2> SrcPartials;
52523 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52524 SrcOps.size() == 1) {
52525 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52526 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52527 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52528 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52529 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52530 if (Mask) {
52531 assert(SrcPartials[0].getBitWidth() == NumElts &&
52532 "Unexpected partial reduction mask");
52533 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52534 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52535 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52536 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52537 }
52538 }
52539 }
52540
52541 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52542 return SetCC;
52543
52544 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52545 return R;
52546
52547 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52548 return R;
52549
52550 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52551 return R;
52552
52553 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52554 DAG, DCI, Subtarget))
52555 return FPLogic;
52556
52557 if (DCI.isBeforeLegalizeOps())
52558 return SDValue();
52559
52560 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52561 return R;
52562
52563 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52564 return R;
52565
52566 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52567 return R;
52568
52569 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52570 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52571 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52572 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52573 uint64_t Val = CN->getZExtValue();
52574 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52575 Val == 8) {
52576 SDValue NotCond;
52577 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52578 N0.getOperand(1).hasOneUse()) {
52579          X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52580          X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52581          NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52582 } else if (N0.getOpcode() == ISD::SUB &&
52583 isNullConstant(N0.getOperand(0))) {
52584 SDValue Cond = N0.getOperand(1);
52585 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52586 Cond = Cond.getOperand(0);
52587 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52588 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52589            X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52590            NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52591 }
52592 }
52593
52594 if (NotCond) {
52595 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52596 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52597 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52598 return R;
52599 }
52600 }
52601 }
52602 }
52603
52604 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52605 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52606 // iff the upper elements of the non-shifted arg are zero.
52607  // KUNPCK requires 16+ bool vector elements.
52608 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52609 unsigned NumElts = VT.getVectorNumElements();
52610 unsigned HalfElts = NumElts / 2;
52611 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52612 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52613 N1.getConstantOperandAPInt(1) == HalfElts &&
52614 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52615 return DAG.getNode(
52616 ISD::CONCAT_VECTORS, dl, VT,
52617 extractSubVector(N0, 0, DAG, dl, HalfElts),
52618 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52619 }
52620 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52621 N0.getConstantOperandAPInt(1) == HalfElts &&
52622 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52623 return DAG.getNode(
52624 ISD::CONCAT_VECTORS, dl, VT,
52625 extractSubVector(N1, 0, DAG, dl, HalfElts),
52626 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52627 }
52628 }
52629
52630 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52631 // Attempt to recursively combine an OR of shuffles.
52632 SDValue Op(N, 0);
52633 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52634 return Res;
52635
52636 // If either operand is a constant mask, then only the elements that aren't
52637 // allones are actually demanded by the other operand.
52638 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52639 APInt UndefElts;
52640 SmallVector<APInt> EltBits;
52641 int NumElts = VT.getVectorNumElements();
52642 int EltSizeInBits = VT.getScalarSizeInBits();
52643 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52644 return false;
52645
52646 APInt DemandedElts = APInt::getZero(NumElts);
52647 for (int I = 0; I != NumElts; ++I)
52648 if (!EltBits[I].isAllOnes())
52649 DemandedElts.setBit(I);
52650
52651 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52652 };
52653 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52654 if (N->getOpcode() != ISD::DELETED_NODE)
52655 DCI.AddToWorklist(N);
52656 return SDValue(N, 0);
52657 }
52658 }
52659
52660 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52661 return R;
52662
52663 return SDValue();
52664}
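
The LEA-oriented rewrite of `(0 - SetCC) | C` above can be sanity-checked with a 0/1 stand-in for the SETCC value (illustrative only; not part of this file):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Consts[] = {1u, 2u, 3u, 4u, 7u, 8u};
  for (uint32_t B = 0; B <= 1; ++B) // B models the 0/1 SETCC result
    for (uint32_t C : Consts) {
      // (0 - B) | C is C when B == 0 and all-ones when B == 1.
      uint32_t Or = (0u - B) | C;
      // (zext(!B)) * (C + 1) - 1 produces the same two values, and the
      // multiply by C + 1 (2/3/4/5/8/9) plus decrement is LEA-friendly.
      uint32_t Lea = (B ? 0u : 1u) * (C + 1u) - 1u;
      assert(Or == Lea);
    }
  return 0;
}
```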
52665
52666/// Try to turn tests against the signbit in the form of:
52667/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52668/// into:
52669/// SETGT(X, -1)
52670static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52671                                        SelectionDAG &DAG) {
52672 // This is only worth doing if the output type is i8 or i1.
52673 EVT ResultType = N->getValueType(0);
52674 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52675 return SDValue();
52676
52677 SDValue N0 = N->getOperand(0);
52678 SDValue N1 = N->getOperand(1);
52679
52680 // We should be performing an xor against a truncated shift.
52681 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52682 return SDValue();
52683
52684 // Make sure we are performing an xor against one.
52685 if (!isOneConstant(N1))
52686 return SDValue();
52687
52688 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52689 SDValue Shift = N0.getOperand(0);
52690 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52691 return SDValue();
52692
52693 // Make sure we are truncating from one of i16, i32 or i64.
52694 EVT ShiftTy = Shift.getValueType();
52695 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52696 return SDValue();
52697
52698 // Make sure the shift amount extracts the sign bit.
52699 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52700 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52701 return SDValue();
52702
52703 // Create a greater-than comparison against -1.
52704 // N.B. Using SETGE against 0 works but we want a canonical looking
52705  // comparison; using SETGT matches up with what TranslateX86CC expects.
52706 SDValue ShiftOp = Shift.getOperand(0);
52707 EVT ShiftOpTy = ShiftOp.getValueType();
52708 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52709 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52710 *DAG.getContext(), ResultType);
52711 SDValue Cond =
52712 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52713 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52714 if (SetCCResultType != ResultType)
52715 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52716 return Cond;
52717}
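
A scalar model of the rewrite, using 16-bit values for brevity (illustrative only; not part of this file):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int v = -32768; v <= 32767; ++v) {
    uint16_t X = uint16_t(v);
    // xor(trunc(srl(X, 15)), 1) is 1 exactly when the sign bit is clear,
    // which is the signed comparison X > -1 that SETGT produces.
    uint8_t Lowered = uint8_t((X >> 15) ^ 1u);
    uint8_t SetGT = uint8_t(int16_t(X) > -1);
    assert(Lowered == SetGT);
  }
  return 0;
}
```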
52718
52719/// Turn vector tests of the signbit in the form of:
52720/// xor (sra X, elt_size(X)-1), -1
52721/// into:
52722/// pcmpgt X, -1
52723///
52724/// This should be called before type legalization because the pattern may not
52725/// persist after that.
52726static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52727                                         const X86Subtarget &Subtarget) {
52728 EVT VT = N->getValueType(0);
52729 if (!VT.isSimple())
52730 return SDValue();
52731
52732 switch (VT.getSimpleVT().SimpleTy) {
52733 // clang-format off
52734 default: return SDValue();
52735 case MVT::v16i8:
52736 case MVT::v8i16:
52737 case MVT::v4i32:
52738 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52739 case MVT::v32i8:
52740 case MVT::v16i16:
52741 case MVT::v8i32:
52742 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52743 // clang-format on
52744 }
52745
52746 // There must be a shift right algebraic before the xor, and the xor must be a
52747 // 'not' operation.
52748 SDValue Shift = N->getOperand(0);
52749 SDValue Ones = N->getOperand(1);
52750 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52751      !ISD::isBuildVectorAllOnes(Ones.getNode()))
52752    return SDValue();
52753
52754 // The shift should be smearing the sign bit across each vector element.
52755 auto *ShiftAmt =
52756 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52757 if (!ShiftAmt ||
52758 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52759 return SDValue();
52760
52761 // Create a greater-than comparison against -1. We don't use the more obvious
52762 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52763 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52764}
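
Modelled on a single int8_t lane, the equivalence with pcmpgt(X, -1) looks like this (illustrative only, assuming the usual arithmetic right shift for signed values; not part of this file):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t X = int8_t(v);
    // sra(X, 7) smears the sign bit: -1 for negative lanes, 0 otherwise.
    int8_t Smeared = int8_t(X >> 7);
    // xor with all-ones ('not') gives exactly the pcmpgt(X, -1) lane value:
    // all-ones when X > -1, all-zeros when X is negative.
    int8_t NotSmeared = int8_t(Smeared ^ int8_t(-1));
    int8_t Pcmpgt = X > -1 ? int8_t(-1) : int8_t(0);
    assert(NotSmeared == Pcmpgt);
  }
  return 0;
}
```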
52765
52766/// Detect patterns of truncation with unsigned saturation:
52767///
52768/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52769/// Return the source value x to be truncated or SDValue() if the pattern was
52770/// not matched.
52771///
52772/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52773/// where C1 >= 0 and C2 is unsigned max of destination type.
52774///
52775/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52776/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52777///
52778/// These two patterns are equivalent to:
52779/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52780/// So return the smax(x, C1) value to be truncated or SDValue() if the
52781/// pattern was not matched.
52782static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52783                                 const SDLoc &DL) {
52784 using namespace llvm::SDPatternMatch;
52785 EVT InVT = In.getValueType();
52786
52787 // Saturation with truncation. We truncate from InVT to VT.
52788  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52789         "Unexpected types for truncate operation");
52790
52791 APInt C1, C2;
52792  SDValue UMin, SMin, SMax;
52793
52794  // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52795 // the element size of the destination type.
52796 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52797 C2.isMask(VT.getScalarSizeInBits()))
52798 return UMin;
52799
52800 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52801      sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52802      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52803 return SMin;
52804
52805 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52806      sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52807      C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52808 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52809
52810 return SDValue();
52811}
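
Pattern 1 is the familiar unsigned clamp-then-truncate; a scalar sketch for a uint32_t to uint16_t truncation (illustrative only; not part of this file):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x <= 70000; x += 7) {
    // (truncate (umin x, 0xFFFF) to i16) is an unsigned saturating
    // truncation of x to 16 bits.
    uint16_t Truncated = uint16_t(std::min<uint32_t>(x, 0xFFFFu));
    uint16_t Saturated = x > 0xFFFFu ? uint16_t(0xFFFF) : uint16_t(x);
    assert(Truncated == Saturated);
  }
  return 0;
}
```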
52812
52813/// Detect patterns of truncation with signed saturation:
52814/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52815/// signed_max_of_dest_type)) to dest_type)
52816/// or:
52817/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52818/// signed_min_of_dest_type)) to dest_type).
52819/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52820/// Return the source value to be truncated or SDValue() if the pattern was not
52821/// matched.
52822static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52823 using namespace llvm::SDPatternMatch;
52824 unsigned NumDstBits = VT.getScalarSizeInBits();
52825 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52826 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52827
52828 APInt SignedMax, SignedMin;
52829 if (MatchPackUS) {
52830 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52831 SignedMin = APInt::getZero(NumSrcBits);
52832 } else {
52833 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52834 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52835 }
52836
52837 SDValue SMin, SMax;
52838 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52839 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52840 return SMax;
52841
52842 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52843 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52844 return SMin;
52845
52846 return SDValue();
52847}
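
The signed form matches what PACKSS does per element; a scalar sketch of an int to int16_t saturating truncation (illustrative only; not part of this file):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (int x = -40000; x <= 40000; x += 3) {
    // (truncate (smin (smax x, -32768), 32767) to i16) is the signed
    // saturating truncation matched above.
    int16_t Packed = int16_t(std::min(std::max(x, -32768), 32767));
    int Expected = x < -32768 ? -32768 : (x > 32767 ? 32767 : x);
    assert(Packed == int16_t(Expected));
  }
  return 0;
}
```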
52848
52849static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52850                                      SelectionDAG &DAG,
52851 const X86Subtarget &Subtarget) {
52852 if (!Subtarget.hasSSE2() || !VT.isVector())
52853 return SDValue();
52854
52855 EVT SVT = VT.getVectorElementType();
52856 EVT InVT = In.getValueType();
52857 EVT InSVT = InVT.getVectorElementType();
52858
52859 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52860 // split across two registers. We can use a packusdw+perm to clamp to 0-65535
52861 // and concatenate at the same time. Then we can use a final vpmovuswb to
52862 // clip to 0-255.
52863 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52864 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52865 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52866 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52867 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52868 DL, DAG, Subtarget);
52869 assert(Mid && "Failed to pack!");
52870 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52871 }
52872 }
52873
52874 // vXi32 truncate instructions are available with AVX512F.
52875 // vXi16 truncate instructions are only available with AVX512BW.
52876 // For 256-bit or smaller vectors, we require VLX.
52877 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52878  // If the result type is 256 bits or larger and we have disabled 512-bit
52879 // registers, we should go ahead and use the pack instructions if possible.
52880 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52881 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52882 (InVT.getSizeInBits() > 128) &&
52883 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52884 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52885
52886 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52887      isPowerOf2_32(VT.getVectorNumElements()) &&
52888      (SVT == MVT::i8 || SVT == MVT::i16) &&
52889 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52890 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52891 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52892 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52893 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52894 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52895 DAG, Subtarget);
52896 assert(Mid && "Failed to pack!");
52897        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52898                                           Subtarget);
52899 assert(V && "Failed to pack!");
52900 return V;
52901 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52902 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52903 Subtarget);
52904 }
52905 if (SDValue SSatVal = detectSSatPattern(In, VT))
52906 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52907 Subtarget);
52908 }
52909
52910 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52911 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52912 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52913 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52914 unsigned TruncOpc = 0;
52915 SDValue SatVal;
52916 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52917 SatVal = SSatVal;
52918 TruncOpc = X86ISD::VTRUNCS;
52919 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52920 SatVal = USatVal;
52921 TruncOpc = X86ISD::VTRUNCUS;
52922 }
52923 if (SatVal) {
52924 unsigned ResElts = VT.getVectorNumElements();
52925 // If the input type is less than 512 bits and we don't have VLX, we need
52926 // to widen to 512 bits.
52927 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52928 unsigned NumConcats = 512 / InVT.getSizeInBits();
52929 ResElts *= NumConcats;
52930 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
52931 ConcatOps[0] = SatVal;
52932 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
52933 NumConcats * InVT.getVectorNumElements());
52934 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
52935 }
52936      // Widen the result if it's narrower than 128 bits.
52937 if (ResElts * SVT.getSizeInBits() < 128)
52938 ResElts = 128 / SVT.getSizeInBits();
52939 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
52940 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
52941 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
52942 DAG.getVectorIdxConstant(0, DL));
52943 }
52944 }
52945
52946 return SDValue();
52947}
52948
52949static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
52950                                        SelectionDAG &DAG,
52951                                        TargetLowering::DAGCombinerInfo &DCI,
52952                                        const X86Subtarget &Subtarget) {
52953 auto *Ld = cast<LoadSDNode>(N);
52954 EVT RegVT = Ld->getValueType(0);
52955 SDValue Ptr = Ld->getBasePtr();
52956 SDValue Chain = Ld->getChain();
52957 ISD::LoadExtType Ext = Ld->getExtensionType();
52958
52959 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
52960 return SDValue();
52961
52962 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
52963 return SDValue();
52964
52965  const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
52966  if (!LdC)
52967 return SDValue();
52968
52969 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
52970 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
52971 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
52972 if (Undefs[I])
52973 continue;
52974 if (UserUndefs[I] || Bits[I] != UserBits[I])
52975 return false;
52976 }
52977 return true;
52978 };
52979
52980 // Look through all other loads/broadcasts in the chain for another constant
52981 // pool entry.
52982 for (SDNode *User : Chain->users()) {
52983 auto *UserLd = dyn_cast<MemSDNode>(User);
52984 if (User != N && UserLd &&
52985 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
52986 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
52987         ISD::isNormalLoad(User)) &&
52988        UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
52989 User->getValueSizeInBits(0).getFixedValue() >
52990 RegVT.getFixedSizeInBits()) {
52991 EVT UserVT = User->getValueType(0);
52992 SDValue UserPtr = UserLd->getBasePtr();
52993 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
52994
52995 // See if we are loading a constant that matches in the lower
52996 // bits of a longer constant (but from a different constant pool ptr).
52997 if (UserC && UserPtr != Ptr) {
52998 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
52999 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53000 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53001 APInt Undefs, UserUndefs;
53002 SmallVector<APInt> Bits, UserBits;
53003 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53004 UserVT.getScalarSizeInBits());
53005 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53006 Bits) &&
53007              getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53008                                            UserUndefs, UserBits)) {
53009 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53011 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53012 RegVT.getSizeInBits());
53013 Extract = DAG.getBitcast(RegVT, Extract);
53014 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53015 }
53016 }
53017 }
53018 }
53019 }
53020 }
53021
53022 return SDValue();
53023}
53024
53025static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53026                           TargetLowering::DAGCombinerInfo &DCI,
53027                           const X86Subtarget &Subtarget) {
53028 auto *Ld = cast<LoadSDNode>(N);
53029 EVT RegVT = Ld->getValueType(0);
53030 EVT MemVT = Ld->getMemoryVT();
53031 SDLoc dl(Ld);
53032 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53033
53034 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53035 // into two 16-byte operations. Also split non-temporal aligned loads on
53036 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53037 ISD::LoadExtType Ext = Ld->getExtensionType();
53038 unsigned Fast;
53039 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53040 Ext == ISD::NON_EXTLOAD &&
53041 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53042 Ld->getAlign() >= Align(16)) ||
53043 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53044 *Ld->getMemOperand(), &Fast) &&
53045 !Fast))) {
53046 unsigned NumElems = RegVT.getVectorNumElements();
53047 if (NumElems < 2)
53048 return SDValue();
53049
53050 unsigned HalfOffset = 16;
53051 SDValue Ptr1 = Ld->getBasePtr();
53052 SDValue Ptr2 =
53053 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53054 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53055 NumElems / 2);
53056 SDValue Load1 =
53057 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53058 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53059 SDValue Load2 =
53060 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53061 Ld->getPointerInfo().getWithOffset(HalfOffset),
53062 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53063 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53064 Load1.getValue(1), Load2.getValue(1));
53065
53066 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53067 return DCI.CombineTo(N, NewVec, TF, true);
53068 }
53069
53070 // Bool vector load - attempt to cast to an integer, as we have good
53071 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53072 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53073 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53074 unsigned NumElts = RegVT.getVectorNumElements();
53075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53076 if (TLI.isTypeLegal(IntVT)) {
53077 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53078 Ld->getPointerInfo(), Ld->getBaseAlign(),
53079 Ld->getMemOperand()->getFlags());
53080 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53081 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53082 }
53083 }
53084
53085 // If we also broadcast this vector to a wider type, then just extract the
53086 // lowest subvector.
53087 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53088 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53089 SDValue Ptr = Ld->getBasePtr();
53090 SDValue Chain = Ld->getChain();
53091 for (SDNode *User : Chain->users()) {
53092 auto *UserLd = dyn_cast<MemSDNode>(User);
53093 if (User != N && UserLd &&
53094 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53095 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53096 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53097 User->hasAnyUseOfValue(0) &&
53098 User->getValueSizeInBits(0).getFixedValue() >
53099 RegVT.getFixedSizeInBits()) {
53101 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53102 RegVT.getSizeInBits());
53103 Extract = DAG.getBitcast(RegVT, Extract);
53104 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53105 }
53106 }
53107 }
53108
53109 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53110 return V;
53111
53112 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53113 unsigned AddrSpace = Ld->getAddressSpace();
53114 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53115 AddrSpace == X86AS::PTR32_UPTR) {
53116 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53117 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53118 SDValue Cast =
53119 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53120 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53121 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53122 Ld->getMemOperand()->getFlags());
53123 }
53124 }
53125
53126 return SDValue();
53127}
53128
53129/// If V is a build vector of boolean constants and exactly one of those
53130/// constants is true, return the operand index of that true element.
53131/// Otherwise, return -1.
53132static int getOneTrueElt(SDValue V) {
53133 // This needs to be a build vector of booleans.
53134 // TODO: Checking for the i1 type matches the IR definition for the mask,
53135 // but the mask check could be loosened to i8 or other types. That might
53136 // also require checking more than 'allOnesValue'; eg, the x86 HW
53137 // instructions only require that the MSB is set for each mask element.
53138 // The ISD::MSTORE comments/definition do not specify how the mask operand
53139 // is formatted.
53140 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53141 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53142 return -1;
53143
53144 int TrueIndex = -1;
53145 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53146 for (unsigned i = 0; i < NumElts; ++i) {
53147 const SDValue &Op = BV->getOperand(i);
53148 if (Op.isUndef())
53149 continue;
53150 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53151 if (!ConstNode)
53152 return -1;
53153 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53154 // If we already found a one, this is too many.
53155 if (TrueIndex >= 0)
53156 return -1;
53157 TrueIndex = i;
53158 }
53159 }
53160 return TrueIndex;
53161}
53162
53163/// Given a masked memory load/store operation, return true if it has one mask
53164/// bit set. If it has one mask bit set, then also return the memory address of
53165/// the scalar element to load/store, the vector index to insert/extract that
53166/// scalar element, and the alignment for the scalar memory access.
53167static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53168                                         SelectionDAG &DAG, SDValue &Addr,
53169 SDValue &Index, Align &Alignment,
53170 unsigned &Offset) {
53171 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53172 if (TrueMaskElt < 0)
53173 return false;
53174
53175 // Get the address of the one scalar element that is specified by the mask
53176 // using the appropriate offset from the base pointer.
53177 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53178 Offset = 0;
53179 Addr = MaskedOp->getBasePtr();
53180 if (TrueMaskElt != 0) {
53181 Offset = TrueMaskElt * EltVT.getStoreSize();
53182    Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53183                                    SDLoc(MaskedOp));
53184 }
53185
53186 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53187 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53188 return true;
53189}
53190
53191/// If exactly one element of the mask is set for a non-extending masked load,
53192/// it is a scalar load and vector insert.
53193/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53194/// mask have already been optimized in IR, so we don't bother with those here.
53195static SDValue
53196reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53197                             TargetLowering::DAGCombinerInfo &DCI,
53198                             const X86Subtarget &Subtarget) {
53199 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53200 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53201 // However, some target hooks may need to be added to know when the transform
53202 // is profitable. Endianness would also have to be considered.
53203
53204 SDValue Addr, VecIndex;
53205 Align Alignment;
53206 unsigned Offset;
53207 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53208 return SDValue();
53209
53210 // Load the one scalar element that is specified by the mask using the
53211 // appropriate offset from the base pointer.
53212 SDLoc DL(ML);
53213 EVT VT = ML->getValueType(0);
53214 EVT EltVT = VT.getVectorElementType();
53215
53216 EVT CastVT = VT;
53217 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53218 EltVT = MVT::f64;
53219 CastVT = VT.changeVectorElementType(EltVT);
53220 }
53221
53222 SDValue Load =
53223 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53224 ML->getPointerInfo().getWithOffset(Offset),
53225 Alignment, ML->getMemOperand()->getFlags());
53226
53227 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53228
53229 // Insert the loaded element into the appropriate place in the vector.
53230 SDValue Insert =
53231 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53232 Insert = DAG.getBitcast(VT, Insert);
53233 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53234}
53235
53236static SDValue
53237combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53238                              TargetLowering::DAGCombinerInfo &DCI) {
53239  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53240 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53241 return SDValue();
53242
53243 SDLoc DL(ML);
53244 EVT VT = ML->getValueType(0);
53245
53246 // If we are loading the first and last elements of a vector, it is safe and
53247 // always faster to load the whole vector. Replace the masked load with a
53248 // vector load and select.
53249 unsigned NumElts = VT.getVectorNumElements();
53250 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53251 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53252 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53253 if (LoadFirstElt && LoadLastElt) {
53254 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53255 ML->getMemOperand());
53256 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53257 ML->getPassThru());
53258 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53259 }
53260
53261 // Convert a masked load with a constant mask into a masked load and a select.
53262 // This allows the select operation to use a faster kind of select instruction
53263 // (for example, vblendvps -> vblendps).
53264
53265 // Don't try this if the pass-through operand is already undefined. That would
53266 // cause an infinite loop because that's what we're about to create.
53267 if (ML->getPassThru().isUndef())
53268 return SDValue();
53269
53270 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53271 return SDValue();
53272
53273 // The new masked load has an undef pass-through operand. The select uses the
53274 // original pass-through operand.
53275 SDValue NewML = DAG.getMaskedLoad(
53276 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53277 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53278 ML->getAddressingMode(), ML->getExtensionType());
53279 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53280 ML->getPassThru());
53281
53282 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53283}
53284
53285static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53286                                 TargetLowering::DAGCombinerInfo &DCI,
53287                                 const X86Subtarget &Subtarget) {
53288 auto *Mld = cast<MaskedLoadSDNode>(N);
53289
53290 // TODO: Expanding load with constant mask may be optimized as well.
53291 if (Mld->isExpandingLoad())
53292 return SDValue();
53293
53294 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53295 if (SDValue ScalarLoad =
53296 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53297 return ScalarLoad;
53298
53299 // TODO: Do some AVX512 subsets benefit from this transform?
53300 if (!Subtarget.hasAVX512())
53301 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53302 return Blend;
53303 }
53304
53305 // If the mask value has been legalized to a non-boolean vector, try to
53306 // simplify ops leading up to it. We only demand the MSB of each lane.
53307 SDValue Mask = Mld->getMask();
53308 if (Mask.getScalarValueSizeInBits() != 1) {
53309 EVT VT = Mld->getValueType(0);
53310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53311    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53312    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53313 if (N->getOpcode() != ISD::DELETED_NODE)
53314 DCI.AddToWorklist(N);
53315 return SDValue(N, 0);
53316 }
53317 if (SDValue NewMask =
53318            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53319      return DAG.getMaskedLoad(
53320 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53321 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53322 Mld->getAddressingMode(), Mld->getExtensionType());
53323 }
53324
53325 return SDValue();
53326}
53327
53328/// If exactly one element of the mask is set for a non-truncating masked store,
53329/// it is a vector extract and scalar store.
53330/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53331/// mask have already been optimized in IR, so we don't bother with those here.
53332static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53333                                              SelectionDAG &DAG,
53334 const X86Subtarget &Subtarget) {
53335 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53336 // However, some target hooks may need to be added to know when the transform
53337 // is profitable. Endianness would also have to be considered.
53338
53339 SDValue Addr, VecIndex;
53340 Align Alignment;
53341 unsigned Offset;
53342 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53343 return SDValue();
53344
53345 // Extract the one scalar element that is actually being stored.
53346 SDLoc DL(MS);
53347 SDValue Value = MS->getValue();
53348 EVT VT = Value.getValueType();
53349 EVT EltVT = VT.getVectorElementType();
53350 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53351 EltVT = MVT::f64;
53352 EVT CastVT = VT.changeVectorElementType(EltVT);
53353 Value = DAG.getBitcast(CastVT, Value);
53354 }
53355 SDValue Extract =
53356 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53357
53358 // Store that element at the appropriate offset from the base pointer.
53359 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53360                      MS->getPointerInfo().getWithOffset(Offset),
53361                      Alignment, MS->getMemOperand()->getFlags());
53362}
53363
53364static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53365                                  TargetLowering::DAGCombinerInfo &DCI,
53366                                  const X86Subtarget &Subtarget) {
53367  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
53368  if (Mst->isCompressingStore())
53369 return SDValue();
53370
53371 EVT VT = Mst->getValue().getValueType();
53372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53373
53374 if (Mst->isTruncatingStore())
53375 return SDValue();
53376
53377 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53378 return ScalarStore;
53379
53380 // If the mask value has been legalized to a non-boolean vector, try to
53381 // simplify ops leading up to it. We only demand the MSB of each lane.
53382 SDValue Mask = Mst->getMask();
53383 if (Mask.getScalarValueSizeInBits() != 1) {
53384    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53385    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53386 if (N->getOpcode() != ISD::DELETED_NODE)
53387 DCI.AddToWorklist(N);
53388 return SDValue(N, 0);
53389 }
53390 if (SDValue NewMask =
53391            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53392      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53393 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53394 Mst->getMemoryVT(), Mst->getMemOperand(),
53395 Mst->getAddressingMode());
53396 }
53397
53398 SDValue Value = Mst->getValue();
53399 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53400 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53401 Mst->getMemoryVT())) {
53402 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53403 Mst->getBasePtr(), Mst->getOffset(), Mask,
53404 Mst->getMemoryVT(), Mst->getMemOperand(),
53405 Mst->getAddressingMode(), true);
53406 }
53407
53408 return SDValue();
53409}
53410
53411static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53412                            TargetLowering::DAGCombinerInfo &DCI,
53413                            const X86Subtarget &Subtarget) {
53414  StoreSDNode *St = cast<StoreSDNode>(N);
53415  EVT StVT = St->getMemoryVT();
53416 SDLoc dl(St);
53417 SDValue StoredVal = St->getValue();
53418 EVT VT = StoredVal.getValueType();
53419 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53420
53421 // Convert a store of vXi1 into a store of iX and a bitcast.
53422 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53423 VT.getVectorElementType() == MVT::i1) {
53424
53425    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53426    StoredVal = DAG.getBitcast(NewVT, StoredVal);
53427
53428 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53429 St->getPointerInfo(), St->getBaseAlign(),
53430 St->getMemOperand()->getFlags());
53431 }
53432
53433 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53434 // This will avoid a copy to k-register.
53435 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53436 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53437 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53438 SDValue Val = StoredVal.getOperand(0);
53439 // We must store zeros to the unused bits.
53440 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53441 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53442 St->getPointerInfo(), St->getBaseAlign(),
53443 St->getMemOperand()->getFlags());
53444 }
53445
53446 // Widen v2i1/v4i1 stores to v8i1.
53447 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53448 Subtarget.hasAVX512()) {
53449 unsigned NumConcats = 8 / VT.getVectorNumElements();
53450 // We must store zeros to the unused bits.
53451 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53452 Ops[0] = StoredVal;
53453 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53454 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53455 St->getPointerInfo(), St->getBaseAlign(),
53456 St->getMemOperand()->getFlags());
53457 }
53458
53459 // Turn vXi1 stores of constants into a scalar store.
53460 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53461 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53462      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53463    // If it's a v64i1 store without 64-bit support, we need two stores.
53464 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53465 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53466 StoredVal->ops().slice(0, 32));
53467      Lo = combinevXi1ConstantToInteger(Lo, DAG);
53468      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53469 StoredVal->ops().slice(32, 32));
53470      Hi = combinevXi1ConstantToInteger(Hi, DAG);
53471
53472 SDValue Ptr0 = St->getBasePtr();
53473 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53474
53475 SDValue Ch0 =
53476 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53477 St->getBaseAlign(), St->getMemOperand()->getFlags());
53478 SDValue Ch1 = DAG.getStore(
53479 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53480 St->getBaseAlign(), St->getMemOperand()->getFlags());
53481 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53482 }
53483
53484 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53485 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53486 St->getPointerInfo(), St->getBaseAlign(),
53487 St->getMemOperand()->getFlags());
53488 }
53489
53490 // Convert scalar fabs/fneg load-store to integer equivalents.
53491 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53492 (StoredVal.getOpcode() == ISD::FABS ||
53493 StoredVal.getOpcode() == ISD::FNEG) &&
53494 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53495 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53496 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53497 if (TLI.isTypeLegal(IntVT)) {
53498      APInt SignMask = APInt::getSignMask(VT.getSizeInBits());
53499      unsigned SignOp = ISD::XOR;
53500 if (StoredVal.getOpcode() == ISD::FABS) {
53501 SignMask = ~SignMask;
53502 SignOp = ISD::AND;
53503 }
53504 SDValue LogicOp = DAG.getNode(
53505 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53506 DAG.getConstant(SignMask, dl, IntVT));
53507 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53508 St->getPointerInfo(), St->getBaseAlign(),
53509 St->getMemOperand()->getFlags());
53510 }
53511 }
53512
53513 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53514 // Sandy Bridge, perform two 16-byte stores.
53515 unsigned Fast;
53516 if (VT.is256BitVector() && StVT == VT &&
53517 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53518 *St->getMemOperand(), &Fast) &&
53519 !Fast) {
53520 unsigned NumElems = VT.getVectorNumElements();
53521 if (NumElems < 2)
53522 return SDValue();
53523
53524 return splitVectorStore(St, DAG);
53525 }
53526
53527 // Split under-aligned vector non-temporal stores.
53528 if (St->isNonTemporal() && StVT == VT &&
53529 St->getAlign().value() < VT.getStoreSize()) {
53530 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53531 // vectors or the legalizer can scalarize it to use MOVNTI.
53532 if (VT.is256BitVector() || VT.is512BitVector()) {
53533 unsigned NumElems = VT.getVectorNumElements();
53534 if (NumElems < 2)
53535 return SDValue();
53536 return splitVectorStore(St, DAG);
53537 }
53538
53539 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53540 // to use MOVNTI.
53541 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53542 MVT NTVT = Subtarget.hasSSE4A()
53543 ? MVT::v2f64
53544 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53545 return scalarizeVectorStore(St, NTVT, DAG);
53546 }
53547 }
53548
53549 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53550 // supported, but avx512f is by extending to v16i32 and truncating.
53551 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53552 St->getValue().getOpcode() == ISD::TRUNCATE &&
53553 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53554 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53555 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53556 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53557 St->getValue().getOperand(0));
53558 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53559 MVT::v16i8, St->getMemOperand());
53560 }
53561
53562 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53563 if (!St->isTruncatingStore() &&
53564 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53565 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53566 StoredVal.hasOneUse() &&
53567 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53568 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53569 return EmitTruncSStore(IsSigned, St->getChain(),
53570 dl, StoredVal.getOperand(0), St->getBasePtr(),
53571 VT, St->getMemOperand(), DAG);
53572 }
53573
53574 // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
53575 if (!St->isTruncatingStore()) {
53576 auto IsExtractedElement = [](SDValue V) {
53577 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53578 V = V.getOperand(0);
53579 unsigned Opc = V.getOpcode();
53580      if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
53581          isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53582 V.getOperand(0).hasOneUse())
53583 return V.getOperand(0);
53584 return SDValue();
53585 };
53586 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53587 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53588 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53589 SDValue Src = Trunc.getOperand(0);
53590 MVT DstVT = Trunc.getSimpleValueType();
53591 MVT SrcVT = Src.getSimpleValueType();
53592 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53593 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53594 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53595 if (NumTruncBits == VT.getSizeInBits() &&
53596 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53597 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53598 TruncVT, St->getMemOperand());
53599 }
53600 }
53601 }
53602 }
53603
53604 // Optimize trunc store (of multiple scalars) to shuffle and store.
53605 // First, pack all of the elements in one place. Next, store to memory
53606 // in fewer chunks.
53607 if (St->isTruncatingStore() && VT.isVector()) {
53608 if (TLI.isTruncStoreLegal(VT, StVT)) {
53609 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53610 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53611 dl, Val, St->getBasePtr(),
53612 St->getMemoryVT(), St->getMemOperand(), DAG);
53613 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53614 DAG, dl))
53615 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53616 dl, Val, St->getBasePtr(),
53617 St->getMemoryVT(), St->getMemOperand(), DAG);
53618 }
53619
53620 return SDValue();
53621 }
53622
53623 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53624 unsigned AddrSpace = St->getAddressSpace();
53625 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53626 AddrSpace == X86AS::PTR32_UPTR) {
53627 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53628 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53629 SDValue Cast =
53630 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53631 return DAG.getTruncStore(
53632 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53633 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53634 }
53635 }
53636
53637 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53638 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
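// The load feeding the CMOV uses the same address as the store, so the whole
// sequence collapses into one conditional store of the CMOV's other operand;
// the condition is inverted when the loaded value is CMOV operand 1.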
53639 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53640 Subtarget.hasCF() && St->isSimple()) {
53641 SDValue Cmov;
53642 if (StoredVal.getOpcode() == X86ISD::CMOV)
53643 Cmov = StoredVal;
53644 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53645 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53646 Cmov = StoredVal.getOperand(0);
53647 else
53648 return SDValue();
53649
53650 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53651 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53652 return SDValue();
53653
53654 bool InvertCC = false;
53655 SDValue V = SDValue(Ld, 0);
53656 if (V == Cmov.getOperand(1))
53657 InvertCC = true;
53658 else if (V != Cmov.getOperand(0))
53659 return SDValue();
53660
53661 SDVTList Tys = DAG.getVTList(MVT::Other);
53662 SDValue CC = Cmov.getOperand(2);
53663 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53664 if (InvertCC)
53665 CC = DAG.getTargetConstant(
53666 X86::GetOppositeBranchCondition(
53667 static_cast<X86::CondCode>(Cmov.getConstantOperandVal(2))),
53668 dl, MVT::i8);
53669 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53670 Cmov.getOperand(3)};
53671 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53672 St->getMemOperand());
53673 }
53674
53675 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53676 // the FP state in cases where an emms may be missing.
53677 // A preferable solution to the general problem is to figure out the right
53678 // places to insert EMMS. This qualifies as a quick hack.
53679
53680 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53681 if (VT.getSizeInBits() != 64)
53682 return SDValue();
53683
53684 const Function &F = DAG.getMachineFunction().getFunction();
53685 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53686 bool F64IsLegal =
53687 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53688
53689 if (!F64IsLegal || Subtarget.is64Bit())
53690 return SDValue();
53691
53692 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53693 cast<LoadSDNode>(St->getValue())->isSimple() &&
53694 St->getChain().hasOneUse() && St->isSimple()) {
53695 auto *Ld = cast<LoadSDNode>(St->getValue());
53696
53697 if (!ISD::isNormalLoad(Ld))
53698 return SDValue();
53699
53700 // Avoid the transformation if there are multiple uses of the loaded value.
53701 if (!Ld->hasNUsesOfValue(1, 0))
53702 return SDValue();
53703
53704 SDLoc LdDL(Ld);
53705 SDLoc StDL(N);
53706
53707 // Remove any range metadata as we're converting to f64 load/store.
53708 Ld->getMemOperand()->clearRanges();
53709
53710 // Lower to a single movq load/store pair.
53711 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53712 Ld->getBasePtr(), Ld->getMemOperand());
53713
53714 // Make sure new load is placed in same chain order.
53715 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53716 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53717 St->getMemOperand());
53718 }
53719
53720 // This is similar to the above case, but here we handle a scalar 64-bit
53721 // integer store that is extracted from a vector on a 32-bit target.
53722 // If we have SSE2, then we can treat it like a floating-point double
53723 // to get past legalization. The execution dependencies fixup pass will
53724 // choose the optimal machine instruction for the store if this really is
53725 // an integer or v2f32 rather than an f64.
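// e.g. (store (i64 (extractelt (v2i64 V), 0)), P) becomes
// (store (f64 (extractelt (v2f64 (bitcast V)), 0)), P).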
53726 if (VT == MVT::i64 &&
53727 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53728 SDValue OldExtract = St->getOperand(1);
53729 SDValue ExtOp0 = OldExtract.getOperand(0);
53730 unsigned VecSize = ExtOp0.getValueSizeInBits();
53731 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53732 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53733 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53734 BitCast, OldExtract.getOperand(1));
53735 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53736 St->getPointerInfo(), St->getBaseAlign(),
53737 St->getMemOperand()->getFlags());
53738 }
53739
53740 return SDValue();
53741}
53742
53743 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53744 TargetLowering::DAGCombinerInfo &DCI,
53745 const X86Subtarget &Subtarget) {
53746 auto *St = cast<MemIntrinsicSDNode>(N);
53747
53748 SDValue StoredVal = N->getOperand(1);
53749 MVT VT = StoredVal.getSimpleValueType();
53750 EVT MemVT = St->getMemoryVT();
53751
53752 // Figure out which elements we demand.
53753 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53754 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53755
53756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53757 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53758 if (N->getOpcode() != ISD::DELETED_NODE)
53759 DCI.AddToWorklist(N);
53760 return SDValue(N, 0);
53761 }
53762
53763 return SDValue();
53764}
53765
53766/// Return 'true' if this vector operation is "horizontal"
53767/// and return the operands for the horizontal operation in LHS and RHS. A
53768/// horizontal operation performs the binary operation on successive elements
53769/// of its first operand, then on successive elements of its second operand,
53770/// returning the resulting values in a vector. For example, if
53771/// A = < float a0, float a1, float a2, float a3 >
53772/// and
53773/// B = < float b0, float b1, float b2, float b3 >
53774/// then the result of doing a horizontal operation on A and B is
53775/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53776/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53777/// A horizontal-op B, for some already available A and B, and if so then LHS is
53778/// set to A, RHS to B, and the routine returns 'true'.
53779static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53780 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53781 bool IsCommutative,
53782 SmallVectorImpl<int> &PostShuffleMask,
53783 bool ForceHorizOp) {
53784 // If either operand is undef, bail out. The binop should be simplified.
53785 if (LHS.isUndef() || RHS.isUndef())
53786 return false;
53787
53788 // Look for the following pattern:
53789 // A = < float a0, float a1, float a2, float a3 >
53790 // B = < float b0, float b1, float b2, float b3 >
53791 // and
53792 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53793 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53794 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53795 // which is A horizontal-op B.
53796
53797 MVT VT = LHS.getSimpleValueType();
53798 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53799 "Unsupported vector type for horizontal add/sub");
53800 unsigned NumElts = VT.getVectorNumElements();
53801
53802 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53803 SmallVectorImpl<int> &ShuffleMask) {
53804 bool UseSubVector = false;
53805 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53806 Op.getOperand(0).getValueType().is256BitVector() &&
53807 llvm::isNullConstant(Op.getOperand(1))) {
53808 Op = Op.getOperand(0);
53809 UseSubVector = true;
53810 }
53811 SmallVector<SDValue, 2> SrcOps;
53812 SmallVector<int, 16> SrcMask, ScaledMask;
53813 SDValue BC = peekThroughBitcasts(Op);
53814 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53815 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53816 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53817 })) {
53818 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53819 if (!UseSubVector && SrcOps.size() <= 2 &&
53820 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53821 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53822 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53823 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53824 }
53825 if (UseSubVector && SrcOps.size() == 1 &&
53826 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53827 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53828 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53829 ShuffleMask.assign(Mask.begin(), Mask.end());
53830 }
53831 }
53832 };
53833
53834 // View LHS in the form
53835 // LHS = VECTOR_SHUFFLE A, B, LMask
53836 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53837 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53838 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53839 SDValue A, B;
53840 SmallVector<int, 16> LMask;
53841 GetShuffle(LHS, A, B, LMask);
53842
53843 // Likewise, view RHS in the form
53844 // RHS = VECTOR_SHUFFLE C, D, RMask
53845 SDValue C, D;
53846 SmallVector<int, 16> RMask;
53847 GetShuffle(RHS, C, D, RMask);
53848
53849 // At least one of the operands should be a vector shuffle.
53850 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53851 if (NumShuffles == 0)
53852 return false;
53853
53854 if (LMask.empty()) {
53855 A = LHS;
53856 for (unsigned i = 0; i != NumElts; ++i)
53857 LMask.push_back(i);
53858 }
53859
53860 if (RMask.empty()) {
53861 C = RHS;
53862 for (unsigned i = 0; i != NumElts; ++i)
53863 RMask.push_back(i);
53864 }
53865
53866 // If we have a unary mask, ensure the other op is set to null.
53867 if (isUndefOrInRange(LMask, 0, NumElts))
53868 B = SDValue();
53869 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53870 A = SDValue();
53871
53872 if (isUndefOrInRange(RMask, 0, NumElts))
53873 D = SDValue();
53874 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53875 C = SDValue();
53876
53877 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53878 // RHS operands and shuffle mask.
53879 if (A != C) {
53880 std::swap(C, D);
53881 ShuffleVectorSDNode::commuteMask(RMask);
53882 }
53883 // Check that the shuffles are both shuffling the same vectors.
53884 if (!(A == C && B == D))
53885 return false;
53886
53887 PostShuffleMask.clear();
53888 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53889
53890 // LHS and RHS are now:
53891 // LHS = shuffle A, B, LMask
53892 // RHS = shuffle A, B, RMask
53893 // Check that the masks correspond to performing a horizontal operation.
53894 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53895 // so we just repeat the inner loop if this is a 256-bit op.
53896 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53897 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53898 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53899 assert((NumEltsPer128BitChunk % 2 == 0) &&
53900 "Vector type should have an even number of elements in each lane");
53901 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53902 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53903 // Ignore undefined components.
53904 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53905 if (LIdx < 0 || RIdx < 0 ||
53906 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53907 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53908 continue;
53909
53910 // Check that successive odd/even elements are being operated on. If not,
53911 // this is not a horizontal operation.
53912 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53913 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53914 return false;
53915
53916 // Compute the post-shuffle mask index based on where the element
53917 // is stored in the HOP result, and where it needs to be moved to.
53918 int Base = LIdx & ~1u;
53919 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53920 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
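// e.g. for v8f32 (NumElts = 8, 4 elements per 128-bit lane), LIdx = 2 gives
// Base = 2 and Index = ((2 % 4) / 2) + ((2 % 8) & ~3) = 1, selecting element 1
// of the lane before the low/high half adjustment below.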
53921
53922 // The low half of the 128-bit result must choose from A.
53923 // The high half of the 128-bit result must choose from B,
53924 // unless B is undef. In that case, we are always choosing from A.
53925 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53926 Index += NumEltsPer64BitChunk;
53927 PostShuffleMask[i + j] = Index;
53928 }
53929 }
53930
53931 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
53932 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
53933
53934 bool IsIdentityPostShuffle =
53935 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
53936 if (IsIdentityPostShuffle)
53937 PostShuffleMask.clear();
53938
53939 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
53940 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
53941 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
53942 return false;
53943
53944 // If the source nodes are already used in HorizOps then always accept this.
53945 // Shuffle folding should merge these back together.
53946 auto FoundHorizUser = [&](SDNode *User) {
53947 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
53948 };
53949 ForceHorizOp =
53950 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
53951 llvm::any_of(NewRHS->users(), FoundHorizUser));
53952
53953 // Assume a SingleSource HOP if we only shuffle one input and don't need to
53954 // shuffle the result.
53955 if (!ForceHorizOp &&
53956 !shouldUseHorizontalOp(NewLHS == NewRHS &&
53957 (NumShuffles < 2 || !IsIdentityPostShuffle),
53958 DAG, Subtarget))
53959 return false;
53960
53961 LHS = DAG.getBitcast(VT, NewLHS);
53962 RHS = DAG.getBitcast(VT, NewRHS);
53963 return true;
53964}
53965
53966// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
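// e.g. (v4f32 (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)))
// can become (v4f32 (X86ISD::FHADD A, B)) with an identity post-shuffle.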
53967 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
53968 const X86Subtarget &Subtarget) {
53969 EVT VT = N->getValueType(0);
53970 unsigned Opcode = N->getOpcode();
53971 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
53972 SmallVector<int, 8> PostShuffleMask;
53973
53974 auto MergableHorizOp = [N](unsigned HorizOpcode) {
53975 return N->hasOneUse() &&
53976 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
53977 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
53978 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
53979 };
53980
53981 switch (Opcode) {
53982 case ISD::FADD:
53983 case ISD::FSUB:
53984 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
53985 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
53986 SDValue LHS = N->getOperand(0);
53987 SDValue RHS = N->getOperand(1);
53988 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
53989 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
53990 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
53991 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
53992 if (!PostShuffleMask.empty())
53993 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
53994 DAG.getUNDEF(VT), PostShuffleMask);
53995 return HorizBinOp;
53996 }
53997 }
53998 break;
53999 case ISD::ADD:
54000 case ISD::SUB:
54001 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54002 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54003 SDValue LHS = N->getOperand(0);
54004 SDValue RHS = N->getOperand(1);
54005 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54006 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54007 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54008 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54009 ArrayRef<SDValue> Ops) {
54010 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54011 };
54012 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54013 {LHS, RHS}, HOpBuilder);
54014 if (!PostShuffleMask.empty())
54015 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54016 DAG.getUNDEF(VT), PostShuffleMask);
54017 return HorizBinOp;
54018 }
54019 }
54020 break;
54021 }
54022
54023 return SDValue();
54024}
54025
54026// Try to combine the following nodes
54027// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54028// <i32 -2147483648[float -0.000000e+00]> 0
54029// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54030// <(load 4 from constant-pool)> t0, t29
54031// [t30: v16i32 = bitcast t27]
54032// t6: v16i32 = xor t7, t27[t30]
54033// t11: v16f32 = bitcast t6
54034// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54035// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54036// t22: v16f32 = bitcast t7
54037// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54038// t24: v32f16 = bitcast t23
54039 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54040 const X86Subtarget &Subtarget) {
54041 EVT VT = N->getValueType(0);
54042 SDValue LHS = N->getOperand(0);
54043 SDValue RHS = N->getOperand(1);
54044 int CombineOpcode =
54045 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54046 auto combineConjugation = [&](SDValue &r) {
54047 if (LHS->getOpcode() == ISD::BITCAST) {
54048 SDValue XOR = LHS.getOperand(0);
54049 if (XOR->getOpcode() == ISD::XOR) {
54050 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54051 if (XORRHS.isConstant()) {
54052 APInt ConjugationInt32 = APInt(32, 0x80000000);
54053 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54054 if ((XORRHS.getBitWidth() == 32 &&
54055 XORRHS.getConstant() == ConjugationInt32) ||
54056 (XORRHS.getBitWidth() == 64 &&
54057 XORRHS.getConstant() == ConjugationInt64)) {
54058 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54059 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54060 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54061 r = DAG.getBitcast(VT, FCMulC);
54062 return true;
54063 }
54064 }
54065 }
54066 }
54067 return false;
54068 };
54069 SDValue Res;
54070 if (combineConjugation(Res))
54071 return Res;
54072 std::swap(LHS, RHS);
54073 if (combineConjugation(Res))
54074 return Res;
54075 return Res;
54076}
54077
54078// Try to combine the following nodes:
54079// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
54080 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54081 const X86Subtarget &Subtarget) {
54082 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54083 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54084 Flags.hasAllowContract();
54085 };
54086
54087 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54088 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54089 Flags.hasNoSignedZeros();
54090 };
54091 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54092 APInt AI = APInt(32, 0x80008000);
54093 KnownBits Bits = DAG.computeKnownBits(Op);
54094 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54095 Bits.getConstant() == AI;
54096 };
54097
54098 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54099 !AllowContract(N->getFlags()))
54100 return SDValue();
54101
54102 EVT VT = N->getValueType(0);
54103 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54104 return SDValue();
54105
54106 SDValue LHS = N->getOperand(0);
54107 SDValue RHS = N->getOperand(1);
54108 bool IsConj;
54109 SDValue FAddOp1, MulOp0, MulOp1;
54110 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54111 &IsVectorAllNegativeZero,
54112 &HasNoSignedZero](SDValue N) -> bool {
54113 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54114 return false;
54115 SDValue Op0 = N.getOperand(0);
54116 unsigned Opcode = Op0.getOpcode();
54117 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54118 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54119 MulOp0 = Op0.getOperand(0);
54120 MulOp1 = Op0.getOperand(1);
54121 IsConj = Opcode == X86ISD::VFCMULC;
54122 return true;
54123 }
54124 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54125 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54126 HasNoSignedZero(Op0->getFlags())) ||
54127 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54128 MulOp0 = Op0.getOperand(0);
54129 MulOp1 = Op0.getOperand(1);
54130 IsConj = Opcode == X86ISD::VFCMADDC;
54131 return true;
54132 }
54133 }
54134 return false;
54135 };
54136
54137 if (GetCFmulFrom(LHS))
54138 FAddOp1 = RHS;
54139 else if (GetCFmulFrom(RHS))
54140 FAddOp1 = LHS;
54141 else
54142 return SDValue();
54143
54144 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54145 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54146 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54147 // FIXME: How do we handle when fast math flags of FADD are different from
54148 // CFMUL's?
54149 SDValue CFmul =
54150 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54151 return DAG.getBitcast(VT, CFmul);
54152}
54153
54154/// Do target-specific dag combines on floating-point adds/subs.
54155 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54156 const X86Subtarget &Subtarget) {
54157 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54158 return HOp;
54159
54160 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54161 return COp;
54162
54163 return SDValue();
54164}
54165
54166 static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54167 const X86Subtarget &Subtarget) {
54168 EVT VT = N->getValueType(0);
54169 SDValue Src = N->getOperand(0);
54170 EVT SrcVT = Src.getValueType();
54171 SDLoc DL(N);
54172
54173 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54174
54175 // Let legalize expand this if it isn't a legal type yet.
54176 if (!TLI.isTypeLegal(VT))
54177 return SDValue();
54178
54179 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54180 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54181 return SDValue();
54182
54183 if (SrcVT == MVT::v2f16) {
54184 SrcVT = MVT::v4f16;
54185 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54186 DAG.getUNDEF(MVT::v2f16));
54187 }
54188
54189 if (SrcVT == MVT::v4f16) {
54190 SrcVT = MVT::v8f16;
54191 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54192 DAG.getUNDEF(MVT::v4f16));
54193 } else if (SrcVT == MVT::v2f32) {
54194 SrcVT = MVT::v4f32;
54195 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54196 DAG.getUNDEF(MVT::v2f32));
54197 } else {
54198 return SDValue();
54199 }
54200
54201 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54202}
54203
54204// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54205// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54206// are able to avoid generating code with MOVABS and large constants in certain
54207// cases.
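// e.g. (i32 (trunc (srl (or X, 0x8000000000000000), 40))) becomes
// (or (trunc (srl X, 40)), 0x800000), avoiding a MOVABS of the i64 constant.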
54208 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54209 const SDLoc &DL) {
54210 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54211 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54212 if (!ValidSrlConst)
54213 return SDValue();
54214 unsigned SrlConstVal = *ValidSrlConst;
54215
54216 SDValue Op = N.getOperand(0);
54217 unsigned Opcode = Op.getOpcode();
54218 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54219 "Illegal truncation types");
54220
54221 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54222 !isa<ConstantSDNode>(Op.getOperand(1)))
54223 return SDValue();
54224 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54225
54226 if (SrlConstVal <= 32 ||
54227 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54228 return SDValue();
54229
54230 SDValue OpLhsSrl =
54231 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54232 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54233
54234 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54235 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54236 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54237
54238 if (Opcode == ISD::ADD) {
54239 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54240 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54241 }
54242 return NewOpNode;
54243}
54244
54245/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54246/// the codegen.
54247/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54248/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54249/// anything that is guaranteed to be transformed by DAGCombiner.
54250 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54251 const X86Subtarget &Subtarget,
54252 const SDLoc &DL) {
54253 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54254 SDValue Src = N->getOperand(0);
54255 unsigned SrcOpcode = Src.getOpcode();
54256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54257
54258 EVT VT = N->getValueType(0);
54259 EVT SrcVT = Src.getValueType();
54260
54261 auto IsFreeTruncation = [VT](SDValue Op) {
54262 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54263
54264 // See if this has been extended from a smaller/equal size to
54265 // the truncation size, allowing a truncation to combine with the extend.
54266 unsigned Opcode = Op.getOpcode();
54267 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54268 Opcode == ISD::ZERO_EXTEND) &&
54269 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54270 return true;
54271
54272 // See if this is a single use constant which can be constant folded.
54273 // NOTE: We don't peek through bitcasts here because there is currently
54274 // no support for constant folding truncate+bitcast+vector_of_constants. So
54275 // we'll just end up with a truncate on both operands which will
54276 // get turned back into (truncate (binop)) causing an infinite loop.
54277 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54278 };
54279
54280 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54281 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54282 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54283 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54284 };
54285
54286 // Don't combine if the operation has other uses.
54287 if (!Src.hasOneUse())
54288 return SDValue();
54289
54290 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54291 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54292
54293 if (!VT.isVector())
54294 return SDValue();
54295
54296 // In most cases it's only worth pre-truncating if we're only facing the cost
54297 // of one truncation.
54298 // i.e. if one of the inputs will constant fold or the input is repeated.
54299 switch (SrcOpcode) {
54300 case ISD::MUL:
54301 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54302 // better to truncate if we have the chance.
54303 if (SrcVT.getScalarType() == MVT::i64 &&
54304 TLI.isOperationLegal(SrcOpcode, VT) &&
54305 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54306 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54307 [[fallthrough]];
54308 case ISD::AND:
54309 case ISD::XOR:
54310 case ISD::OR:
54311 case ISD::ADD:
54312 case ISD::SUB: {
54313 SDValue Op0 = Src.getOperand(0);
54314 SDValue Op1 = Src.getOperand(1);
54315 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54316 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54317 return TruncateArithmetic(Op0, Op1);
54318 break;
54319 }
54320 }
54321
54322 return SDValue();
54323}
54324
54325// Try to form a MULHU or MULHS node by looking for
54326// (trunc (srl (mul ext, ext), >= 16))
54327// TODO: This is X86 specific because we want to be able to handle wide types
54328// before type legalization. But we can only do it if the vector will be
54329// legalized via widening/splitting. Type legalization can't handle promotion
54330// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54331// combiner.
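// e.g. (v8i16 (trunc (srl (mul (zext v8i16 X to v8i32),
// (zext v8i16 Y to v8i32)), 16))) can become (v8i16 (mulhu X, Y)).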
54332static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54333 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54334 using namespace llvm::SDPatternMatch;
54335
54336 if (!Subtarget.hasSSE2())
54337 return SDValue();
54338
54339 // Only handle vXi16 types that are at least 128-bits unless they will be
54340 // widened.
54341 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54342 return SDValue();
54343
54344 // Input type should be at least vXi32.
54345 EVT InVT = Src.getValueType();
54346 if (InVT.getVectorElementType().getSizeInBits() < 32)
54347 return SDValue();
54348
54349 // First instruction should be a right shift by 16 of a multiply.
54350 SDValue LHS, RHS;
54351 APInt ShiftAmt;
54352 if (!sd_match(Src,
54353 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54354 return SDValue();
54355
54356 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54357 return SDValue();
54358
54359 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54360
54361 // Count leading sign/zero bits on both inputs - if there are enough then
54362 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54363 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54364 // truncations may actually be free by peeking through to the ext source.
54365 auto IsSext = [&DAG](SDValue V) {
54366 return DAG.ComputeMaxSignificantBits(V) <= 16;
54367 };
54368 auto IsZext = [&DAG](SDValue V) {
54369 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54370 };
54371
54372 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54373 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54374 if (!IsSigned && !IsUnsigned)
54375 return SDValue();
54376
54377 // Check if both inputs are extensions, which will be removed by truncation.
54378 auto isOpTruncateFree = [](SDValue Op) {
54379 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54380 Op.getOpcode() == ISD::ZERO_EXTEND)
54381 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54382 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54383 };
54384 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54385
54386 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54387 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54388 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54389 // will have to split anyway.
54390 unsigned InSizeInBits = InVT.getSizeInBits();
54391 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54392 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54393 (InSizeInBits % 16) == 0) {
54394 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54395 InVT.getSizeInBits() / 16);
54396 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54397 DAG.getBitcast(BCVT, RHS));
54398 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54399 return DAG.getNode(ISD::SRL, DL, VT, Res,
54400 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54401 }
54402
54403 // Truncate back to source type.
54404 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54405 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54406
54407 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54408 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54409 return DAG.getNode(ISD::SRL, DL, VT, Res,
54410 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54411}
54412
54413// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54414// from one vector with signed bytes from another vector, adds together
54415// adjacent pairs of 16-bit products, and saturates the result before
54416// truncating to 16-bits.
54417//
54418// Which looks something like this:
54419// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54420// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
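// i.e. per i16 result element i:
// R[i] = ssat(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// which is exactly what X86ISD::VPMADDUBSW produces.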
54421 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54422 const X86Subtarget &Subtarget,
54423 const SDLoc &DL) {
54424 if (!VT.isVector() || !Subtarget.hasSSSE3())
54425 return SDValue();
54426
54427 unsigned NumElems = VT.getVectorNumElements();
54428 EVT ScalarVT = VT.getVectorElementType();
54429 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54430 return SDValue();
54431
54432 SDValue SSatVal = detectSSatPattern(In, VT);
54433 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54434 return SDValue();
54435
54436 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54437 // of multiplies from even/odd elements.
54438 SDValue N0 = SSatVal.getOperand(0);
54439 SDValue N1 = SSatVal.getOperand(1);
54440
54441 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54442 return SDValue();
54443
54444 SDValue N00 = N0.getOperand(0);
54445 SDValue N01 = N0.getOperand(1);
54446 SDValue N10 = N1.getOperand(0);
54447 SDValue N11 = N1.getOperand(1);
54448
54449 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54450 // Canonicalize zero_extend to LHS.
54451 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54452 std::swap(N00, N01);
54453 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54454 std::swap(N10, N11);
54455
54456 // Ensure we have a zero_extend and a sign_extend.
54457 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54458 N01.getOpcode() != ISD::SIGN_EXTEND ||
54459 N10.getOpcode() != ISD::ZERO_EXTEND ||
54460 N11.getOpcode() != ISD::SIGN_EXTEND)
54461 return SDValue();
54462
54463 // Peek through the extends.
54464 N00 = N00.getOperand(0);
54465 N01 = N01.getOperand(0);
54466 N10 = N10.getOperand(0);
54467 N11 = N11.getOperand(0);
54468
54469 // Ensure the extend is from vXi8.
54470 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54471 N01.getValueType().getVectorElementType() != MVT::i8 ||
54472 N10.getValueType().getVectorElementType() != MVT::i8 ||
54473 N11.getValueType().getVectorElementType() != MVT::i8)
54474 return SDValue();
54475
54476 // All inputs should be build_vectors.
54477 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54478 N01.getOpcode() != ISD::BUILD_VECTOR ||
54479 N10.getOpcode() != ISD::BUILD_VECTOR ||
54480 N11.getOpcode() != ISD::BUILD_VECTOR)
54481 return SDValue();
54482
54483 // N00/N10 are zero extended. N01/N11 are sign extended.
54484
54485 // For each element, we need to ensure we have an odd element from one vector
54486 // multiplied by the odd element of another vector and the even element from
54487 // one of the same vectors being multiplied by the even element from the
54488 // other vector. So we need to make sure that for each element i, this
54489 // operation is performed:
54490 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54491 SDValue ZExtIn, SExtIn;
54492 for (unsigned i = 0; i != NumElems; ++i) {
54493 SDValue N00Elt = N00.getOperand(i);
54494 SDValue N01Elt = N01.getOperand(i);
54495 SDValue N10Elt = N10.getOperand(i);
54496 SDValue N11Elt = N11.getOperand(i);
54497 // TODO: Be more tolerant to undefs.
54498 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54499 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54500 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54501 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54502 return SDValue();
54503 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54504 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54505 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54506 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54507 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54508 return SDValue();
54509 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54510 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54511 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54512 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54513 // Add is commutative so indices can be reordered.
54514 if (IdxN00 > IdxN10) {
54515 std::swap(IdxN00, IdxN10);
54516 std::swap(IdxN01, IdxN11);
54517 }
54518 // N0 indices must be the even element (2*i). N1 indices must be the next odd element (2*i+1).
54519 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54520 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54521 return SDValue();
54522 SDValue N00In = N00Elt.getOperand(0);
54523 SDValue N01In = N01Elt.getOperand(0);
54524 SDValue N10In = N10Elt.getOperand(0);
54525 SDValue N11In = N11Elt.getOperand(0);
54526 // First time we find an input capture it.
54527 if (!ZExtIn) {
54528 ZExtIn = N00In;
54529 SExtIn = N01In;
54530 }
54531 if (ZExtIn != N00In || SExtIn != N01In ||
54532 ZExtIn != N10In || SExtIn != N11In)
54533 return SDValue();
54534 }
54535
54536 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54537 EVT ExtVT = Ext.getValueType();
54538 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54539 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54540 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54541 DAG.getVectorIdxConstant(0, DL));
54542 }
54543 };
54544 ExtractVec(ZExtIn);
54545 ExtractVec(SExtIn);
54546
54547 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54548 ArrayRef<SDValue> Ops) {
54549 // Shrink by adding truncate nodes and let DAGCombine fold with the
54550 // sources.
54551 EVT InVT = Ops[0].getValueType();
54552 assert(InVT.getScalarType() == MVT::i8 &&
54553 "Unexpected scalar element type");
54554 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54555 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54556 InVT.getVectorNumElements() / 2);
54557 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54558 };
54559 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54560 PMADDBuilder);
54561}
54562
54563 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54564 const X86Subtarget &Subtarget) {
54565 EVT VT = N->getValueType(0);
54566 SDValue Src = N->getOperand(0);
54567 SDLoc DL(N);
54568
54569 // Attempt to pre-truncate inputs to arithmetic ops instead.
54570 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54571 return V;
54572
54573 // Try to detect PMADD
54574 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54575 return PMAdd;
54576
54577 // Try to combine truncation with signed/unsigned saturation.
54578 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54579 return Val;
54580
54581 // Try to combine PMULHUW/PMULHW for vXi16.
54582 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54583 return V;
54584
54585 // The bitcast source is a direct mmx result.
54587 // Detect a truncation to i32 of a bitcast from x86mmx.
54587 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54588 SDValue BCSrc = Src.getOperand(0);
54589 if (BCSrc.getValueType() == MVT::x86mmx)
54590 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54591 }
54592
54593 return SDValue();
54594}
54595
54596 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54597 TargetLowering::DAGCombinerInfo &DCI) {
54598 EVT VT = N->getValueType(0);
54599 SDValue In = N->getOperand(0);
54600 SDLoc DL(N);
54601
54602 if (SDValue SSatVal = detectSSatPattern(In, VT))
54603 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54604 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54605 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54606
54607 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54608 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54609 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54610 return SDValue(N, 0);
54611
54612 return SDValue();
54613}
54614
54615/// Returns the negated value if the node \p N flips sign of FP value.
54616///
54617/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54618/// or FSUB(0, x)
54619/// AVX512F does not have FXOR, so FNEG is lowered as
54620/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54621/// In this case we go though all bitcasts.
54622/// This also recognizes splat of a negated value and returns the splat of that
54623/// value.
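/// e.g. (fsub -0.0, X) returns X, and for v4f32 an XOR with a splat of
/// 0x80000000 seen through bitcasts also returns X.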
54624static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54625 if (N->getOpcode() == ISD::FNEG)
54626 return N->getOperand(0);
54627
54628 // Don't recurse exponentially.
54629 if (Depth > SelectionDAG::MaxRecursionDepth)
54630 return SDValue();
54631
54632 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54633
54635 EVT VT = Op->getValueType(0);
54636
54637 // Make sure the element size doesn't change.
54638 if (VT.getScalarSizeInBits() != ScalarSize)
54639 return SDValue();
54640
54641 unsigned Opc = Op.getOpcode();
54642 switch (Opc) {
54643 case ISD::VECTOR_SHUFFLE: {
54644 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54645 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54646 if (!Op.getOperand(1).isUndef())
54647 return SDValue();
54648 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54649 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54650 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54651 cast<ShuffleVectorSDNode>(Op)->getMask());
54652 break;
54653 }
54654 case ISD::INSERT_VECTOR_ELT: {
54655 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54656 // -V, INDEX).
54657 SDValue InsVector = Op.getOperand(0);
54658 SDValue InsVal = Op.getOperand(1);
54659 if (!InsVector.isUndef())
54660 return SDValue();
54661 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54662 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54663 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54664 NegInsVal, Op.getOperand(2));
54665 break;
54666 }
54667 case ISD::FSUB:
54668 case ISD::XOR:
54669 case X86ISD::FXOR: {
54670 SDValue Op1 = Op.getOperand(1);
54671 SDValue Op0 = Op.getOperand(0);
54672
54673 // For XOR and FXOR, we want to check if constant
54674 // bits of Op1 are sign bit masks. For FSUB, we
54675 // have to check if constant bits of Op0 are sign
54676 // bit masks and hence we swap the operands.
54677 if (Opc == ISD::FSUB)
54678 std::swap(Op0, Op1);
54679
54680 APInt UndefElts;
54681 SmallVector<APInt, 16> EltBits;
54682 // Extract constant bits and see if they are all
54683 // sign bit masks. Ignore the undef elements.
54684 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54685 /* AllowWholeUndefs */ true,
54686 /* AllowPartialUndefs */ false)) {
54687 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54688 if (!UndefElts[I] && !EltBits[I].isSignMask())
54689 return SDValue();
54690
54691 // Only allow bitcast from correctly-sized constant.
54692 Op0 = peekThroughBitcasts(Op0);
54693 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54694 return Op0;
54695 }
54696 break;
54697 } // case
54698 } // switch
54699
54700 return SDValue();
54701}
54702
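// Map an FMA-family opcode to the variant with the multiply operands, the
// accumulator, and/or the final result negated. e.g. negateFMAOpcode(ISD::FMA,
// /*NegMul=*/true, /*NegAcc=*/true, /*NegRes=*/false) yields X86ISD::FNMSUB,
// i.e. -(a*b) - c.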
54703static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54704 bool NegRes) {
54705 if (NegMul) {
54706 switch (Opcode) {
54707 // clang-format off
54708 default: llvm_unreachable("Unexpected opcode");
54709 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54710 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54711 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54712 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54713 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54714 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54715 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54716 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54717 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54718 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54719 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54720 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54721 // clang-format on
54722 }
54723 }
54724
54725 if (NegAcc) {
54726 switch (Opcode) {
54727 // clang-format off
54728 default: llvm_unreachable("Unexpected opcode");
54729 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54730 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54731 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54732 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54733 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54734 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54735 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54736 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54737 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54738 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54739 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54740 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54741 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54742 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54743 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54744 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54745 // clang-format on
54746 }
54747 }
54748
54749 if (NegRes) {
54750 switch (Opcode) {
54751 // For accuracy reason, we never combine fneg and fma under strict FP.
54752 // clang-format off
54753 default: llvm_unreachable("Unexpected opcode");
54754 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54755 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54756 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54757 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54758 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54759 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54760 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54761 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54762 // clang-format on
54763 }
54764 }
54765
54766 return Opcode;
54767}
54768
54769/// Do target-specific dag combines on floating point negations.
54770 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54771 TargetLowering::DAGCombinerInfo &DCI,
54772 const X86Subtarget &Subtarget) {
54773 EVT OrigVT = N->getValueType(0);
54774 SDValue Arg = isFNEG(DAG, N);
54775 if (!Arg)
54776 return SDValue();
54777
54778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54779 EVT VT = Arg.getValueType();
54780 EVT SVT = VT.getScalarType();
54781 SDLoc DL(N);
54782
54783 // Let legalize expand this if it isn't a legal type yet.
54784 if (!TLI.isTypeLegal(VT))
54785 return SDValue();
54786
54787 // If we're negating a FMUL node on a target with FMA, then we can avoid the
54788 // use of a constant by performing (-0 - A*B) instead.
54789 // FIXME: Check rounding control flags as well once it becomes available.
54790 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54791 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54792 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54793 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54794 Arg.getOperand(1), Zero);
54795 return DAG.getBitcast(OrigVT, NewNode);
54796 }
54797
54798 bool CodeSize = DAG.shouldOptForSize();
54799 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54800 if (SDValue NegArg =
54801 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54802 return DAG.getBitcast(OrigVT, NegArg);
54803
54804 return SDValue();
54805}
54806
54807 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54808 bool LegalOperations,
54809 bool ForCodeSize,
54810 NegatibleCost &Cost,
54811 unsigned Depth) const {
54812 // fneg patterns are removable even if they have multiple uses.
54813 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54814 Cost = NegatibleCost::Cheaper;
54815 return DAG.getBitcast(Op.getValueType(), Arg);
54816 }
54817
54818 EVT VT = Op.getValueType();
54819 EVT SVT = VT.getScalarType();
54820 unsigned Opc = Op.getOpcode();
54821 SDNodeFlags Flags = Op.getNode()->getFlags();
54822 switch (Opc) {
54823 case ISD::FMA:
54824 case X86ISD::FMSUB:
54825 case X86ISD::FNMADD:
54826 case X86ISD::FNMSUB:
54827 case X86ISD::FMADD_RND:
54828 case X86ISD::FMSUB_RND:
54829 case X86ISD::FNMADD_RND:
54830 case X86ISD::FNMSUB_RND: {
54831 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54832 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54833 !isOperationLegal(ISD::FMA, VT))
54834 break;
54835
54836 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54837 // if it may have signed zeros.
54838 if (!Flags.hasNoSignedZeros())
54839 break;
54840
54841 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54842 // keep temporary nodes alive.
54843 std::list<HandleSDNode> Handles;
54844
54845 // This is always negatible for free but we might be able to remove some
54846 // extra operand negations as well.
54847 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54848 for (int i = 0; i != 3; ++i) {
54849 NewOps[i] = getCheaperNegatedExpression(
54850 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54851 if (!!NewOps[i])
54852 Handles.emplace_back(NewOps[i]);
54853 }
54854
54855 bool NegA = !!NewOps[0];
54856 bool NegB = !!NewOps[1];
54857 bool NegC = !!NewOps[2];
54858 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54859
54860 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54861 : NegatibleCost::Neutral;
54862
54863 // Fill in the non-negated ops with the original values.
54864 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54865 if (!NewOps[i])
54866 NewOps[i] = Op.getOperand(i);
54867 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54868 }
54869 case X86ISD::FRCP:
54870 if (SDValue NegOp0 =
54871 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54872 ForCodeSize, Cost, Depth + 1))
54873 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54874 break;
54875 }
54876
54877 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54878 ForCodeSize, Cost, Depth);
54879}
54880
54881 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54882 const X86Subtarget &Subtarget) {
54883 MVT VT = N->getSimpleValueType(0);
54884 // If we have integer vector types available, use the integer opcodes.
54885 if (!VT.isVector() || !Subtarget.hasSSE2())
54886 return SDValue();
54887
54888 SDLoc dl(N);
54890 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54891 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54892 unsigned IntOpcode;
54893 switch (N->getOpcode()) {
54894 // clang-format off
54895 default: llvm_unreachable("Unexpected FP logic op");
54896 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54897 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54898 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54899 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54900 // clang-format on
54901 }
54902 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54903 return DAG.getBitcast(VT, IntOp);
54904}
54905
54906/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
54907 static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54908 if (N->getOpcode() != ISD::XOR)
54909 return SDValue();
54910
54911 SDValue LHS = N->getOperand(0);
54912 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54913 return SDValue();
54914
54915 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54916 X86::CondCode(LHS->getConstantOperandVal(0)));
54917 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54918}
54919
54920 static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54921 const X86Subtarget &Subtarget) {
54922 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54923 "Invalid opcode for combing with CTLZ");
54924 if (Subtarget.hasFastLZCNT())
54925 return SDValue();
54926
54927 EVT VT = N->getValueType(0);
54928 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54929 (VT != MVT::i64 || !Subtarget.is64Bit()))
54930 return SDValue();
54931
54932 SDValue N0 = N->getOperand(0);
54933 SDValue N1 = N->getOperand(1);
54934
54935 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
54936 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
54937 return SDValue();
54938
54939 SDValue OpCTLZ;
54940 SDValue OpSizeTM1;
54941
54942 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
54943 OpCTLZ = N1;
54944 OpSizeTM1 = N0;
54945 } else if (N->getOpcode() == ISD::SUB) {
54946 return SDValue();
54947 } else {
54948 OpCTLZ = N0;
54949 OpSizeTM1 = N1;
54950 }
54951
54952 if (!OpCTLZ.hasOneUse())
54953 return SDValue();
54954 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
54955 if (!C)
54956 return SDValue();
54957
54958 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
54959 return SDValue();
54960 EVT OpVT = VT;
54961 SDValue Op = OpCTLZ.getOperand(0);
54962 if (VT == MVT::i8) {
54963 // Zero extend to i32 since there is not an i8 bsr.
54964 OpVT = MVT::i32;
54965 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
54966 }
54967
54968 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54969 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
54970 if (VT == MVT::i8)
54971 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
54972
54973 return Op;
54974}
54975
54976 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
54977 TargetLowering::DAGCombinerInfo &DCI,
54978 const X86Subtarget &Subtarget) {
54979 SDValue N0 = N->getOperand(0);
54980 SDValue N1 = N->getOperand(1);
54981 EVT VT = N->getValueType(0);
54982 SDLoc DL(N);
54983
54984 // If this is SSE1 only convert to FXOR to avoid scalarization.
54985 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
54986 return DAG.getBitcast(MVT::v4i32,
54987 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
54988 DAG.getBitcast(MVT::v4f32, N0),
54989 DAG.getBitcast(MVT::v4f32, N1)));
54990 }
54991
54992 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
54993 return Cmp;
54994
54995 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
54996 return R;
54997
54998 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
54999 return R;
55000
55001 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55002 return R;
55003
55004 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55005 DAG, DCI, Subtarget))
55006 return FPLogic;
55007
55008 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55009 return R;
55010
55011 if (DCI.isBeforeLegalizeOps())
55012 return SDValue();
55013
55014 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55015 return SetCC;
55016
55017 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55018 return R;
55019
55020 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55021 return RV;
55022
55023 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55024 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55025 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55026 N0.getOperand(0).getValueType().isVector() &&
55027 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55028 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55029 return DAG.getBitcast(
55030 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55031 }
55032
55033 // Handle AVX512 mask widening.
55034 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55035 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55036 VT.getVectorElementType() == MVT::i1 &&
55037 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55038 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55039 return DAG.getNode(
55040 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55041 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55042 N0.getOperand(2));
55043 }
55044
55045 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55046 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55047 // TODO: Under what circumstances could this be performed in DAGCombine?
55048 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55049 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55050 SDValue TruncExtSrc = N0.getOperand(0);
55051 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55052 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55053 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55054 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55055 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55056 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55057 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55058 }
55059 }
55060
55061 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55062 return R;
55063
55064 return combineFneg(N, DAG, DCI, Subtarget);
55065}
55066
55067 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55068 TargetLowering::DAGCombinerInfo &DCI,
55069 const X86Subtarget &Subtarget) {
55070 SDValue N0 = N->getOperand(0);
55071 EVT VT = N->getValueType(0);
55072
55073 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55074 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55075 SDValue Src = N0.getOperand(0);
55076 EVT SrcVT = Src.getValueType();
55077 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55078 (DCI.isBeforeLegalize() ||
55079 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55080 Subtarget.hasSSSE3()) {
55081 unsigned NumElts = SrcVT.getVectorNumElements();
55082 SmallVector<int, 32> ReverseMask(NumElts);
55083 for (unsigned I = 0; I != NumElts; ++I)
55084 ReverseMask[I] = (NumElts - 1) - I;
55085 SDValue Rev =
55086 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55087 return DAG.getBitcast(VT, Rev);
55088 }
55089 }
55090
55091 return SDValue();
55092}
55093
55094// Various combines to try to convert to avgceilu.
55095 static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55096 TargetLowering::DAGCombinerInfo &DCI,
55097 const X86Subtarget &Subtarget) {
55098 unsigned Opcode = N->getOpcode();
55099 SDValue N0 = N->getOperand(0);
55100 SDValue N1 = N->getOperand(1);
55101 EVT VT = N->getValueType(0);
55102 EVT SVT = VT.getScalarType();
55103 SDLoc DL(N);
55104
55105 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55106 // Only useful on vXi8 which doesn't have good SRA handling.
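// For instance (illustrative), on i8 lanes: avgceils(-1, 1) = 0; flipping the
// sign bits gives avgceilu(0x7F, 0x81) = 0x80, and flipping the sign bit of
// the result yields 0x00 again.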
55107 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55108 APInt SignBit = APInt::getSignMask(8);
55109 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55110 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55111 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55112 return DAG.getNode(ISD::XOR, DL, VT,
55113 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55114 }
55115
55116 return SDValue();
55117}
55118
55121 const X86Subtarget &Subtarget) {
55122 EVT VT = N->getValueType(0);
55123 unsigned NumBits = VT.getSizeInBits();
55124
55125 // TODO - Constant Folding.
55126
55127 // Simplify the inputs.
55128 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55129 APInt DemandedMask(APInt::getAllOnes(NumBits));
55130 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55131 return SDValue(N, 0);
55132
55133 return SDValue();
55134}
55135
55136static bool isNullFPScalarOrVectorConst(SDValue V) {
55137 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55138}
55139
55140/// If a value is a scalar FP zero or a vector FP zero (potentially including
55141/// undefined elements), return a zero constant that may be used to fold away
55142/// that value. In the case of a vector, the returned constant will not contain
55143/// undefined elements even if the input parameter does. This makes it suitable
55144/// to be used as a replacement operand with operations (eg, bitwise-and) where
55145/// an undef should not propagate.
55146static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55147 const X86Subtarget &Subtarget) {
55148 if (!isNullFPScalarOrVectorConst(V))
55149 return SDValue();
55150
55151 if (V.getValueType().isVector())
55152 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55153
55154 return V;
55155}
55156
55157static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55158 const X86Subtarget &Subtarget) {
55159 SDValue N0 = N->getOperand(0);
55160 SDValue N1 = N->getOperand(1);
55161 EVT VT = N->getValueType(0);
55162 SDLoc DL(N);
55163
55164 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55165 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55166 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55167 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55168 return SDValue();
55169
55170 auto isAllOnesConstantFP = [](SDValue V) {
55171 if (V.getSimpleValueType().isVector())
55172 return ISD::isBuildVectorAllOnes(V.getNode());
55173 auto *C = dyn_cast<ConstantFPSDNode>(V);
55174 return C && C->getConstantFPValue()->isAllOnesValue();
55175 };
55176
55177 // fand (fxor X, -1), Y --> fandn X, Y
55178 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55179 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55180
55181 // fand X, (fxor Y, -1) --> fandn Y, X
55182 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55183 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55184
55185 return SDValue();
55186}
55187
55188/// Do target-specific dag combines on X86ISD::FAND nodes.
55189static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55190 const X86Subtarget &Subtarget) {
55191 // FAND(0.0, x) -> 0.0
55192 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55193 return V;
55194
55195 // FAND(x, 0.0) -> 0.0
55196 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55197 return V;
55198
55199 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55200 return V;
55201
55202 return lowerX86FPLogicOp(N, DAG, Subtarget);
55203}
55204
55205/// Do target-specific dag combines on X86ISD::FANDN nodes.
55206static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55207 const X86Subtarget &Subtarget) {
55208 // FANDN(0.0, x) -> x
55209 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55210 return N->getOperand(1);
55211
55212 // FANDN(x, 0.0) -> 0.0
55213 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55214 return V;
55215
55216 return lowerX86FPLogicOp(N, DAG, Subtarget);
55217}
55218
55219/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55220static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55221 TargetLowering::DAGCombinerInfo &DCI,
55222 const X86Subtarget &Subtarget) {
55223 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55224
55225 // F[X]OR(0.0, x) -> x
55226 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55227 return N->getOperand(1);
55228
55229 // F[X]OR(x, 0.0) -> x
55230 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55231 return N->getOperand(0);
55232
55233 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55234 return NewVal;
55235
55236 return lowerX86FPLogicOp(N, DAG, Subtarget);
55237}
55238
55239/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55240static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55241 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55242
55243 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55244 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55245 !DAG.getTarget().Options.NoSignedZerosFPMath)
55246 return SDValue();
55247
55248 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
55249 // into FMINC and FMAXC, which are commutative operations.
55250 unsigned NewOp = 0;
55251 switch (N->getOpcode()) {
55252 default: llvm_unreachable("unknown opcode");
55253 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55254 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55255 }
55256
55257 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55258 N->getOperand(0), N->getOperand(1));
55259}
55260
55261static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55262 const X86Subtarget &Subtarget) {
55263 EVT VT = N->getValueType(0);
55264 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55265 return SDValue();
55266
55267 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55268
55269 auto IsMinMaxLegal = [&](EVT VT) {
55270 if (!TLI.isTypeLegal(VT))
55271 return false;
55272 return VT.getScalarType() != MVT::f16 ||
55273 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55274 };
55275
55276 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55277 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55278 (Subtarget.hasFP16() && VT == MVT::f16) ||
55279 (VT.isVector() && IsMinMaxLegal(VT))))
55280 return SDValue();
55281
55282 SDValue Op0 = N->getOperand(0);
55283 SDValue Op1 = N->getOperand(1);
55284 SDLoc DL(N);
55285 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55286
55287 // If we don't have to respect NaN inputs, this is a direct translation to x86
55288 // min/max instructions.
55289 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55290 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55291
55292 // If one of the operands is known non-NaN use the native min/max instructions
55293 // with the non-NaN input as second operand.
55294 if (DAG.isKnownNeverNaN(Op1))
55295 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55296 if (DAG.isKnownNeverNaN(Op0))
55297 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55298
55299 // If we have to respect NaN inputs, this takes at least 3 instructions.
55300 // Favor a library call when operating on a scalar and minimizing code size.
55301 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55302 return SDValue();
55303
55304 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55305 VT);
55306
55307 // There are 4 possibilities involving NaN inputs, and these are the required
55308 // outputs:
55309 // Op1
55310 // Num NaN
55311 // ----------------
55312 // Num | Max | Op0 |
55313 // Op0 ----------------
55314 // NaN | Op1 | NaN |
55315 // ----------------
55316 //
55317 // The SSE FP max/min instructions were not designed for this case, but rather
55318 // to implement:
55319 // Min = Op1 < Op0 ? Op1 : Op0
55320 // Max = Op1 > Op0 ? Op1 : Op0
55321 //
55322 // So they always return Op0 if either input is a NaN. However, we can still
55323 // use those instructions for fmaxnum by selecting away a NaN input.
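// Worked example (illustrative) for fmaxnum: if Op0 is NaN and Op1 is 3.0,
// FMAX(Op1, Op0) passes the NaN through, but IsOp0Nan selects Op1, giving 3.0;
// if Op0 is 3.0 and Op1 is NaN, FMAX(NaN, 3.0) already returns 3.0.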
55324
55325 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55326 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55327 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55328
55329 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55330 // are NaN, the NaN value of Op1 is the result.
55331 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55332}
55333
55334static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55335 TargetLowering::DAGCombinerInfo &DCI) {
55336 EVT VT = N->getValueType(0);
55337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55338
55339 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55340 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55341 return SDValue(N, 0);
55342
55343 // Convert a full vector load into vzload when not all bits are needed.
55344 SDValue In = N->getOperand(0);
55345 MVT InVT = In.getSimpleValueType();
55346 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55347 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55348 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55349 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55350 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55351 MVT MemVT = MVT::getIntegerVT(NumBits);
55352 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55353 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55354 SDLoc dl(N);
55355 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55356 DAG.getBitcast(InVT, VZLoad));
55357 DCI.CombineTo(N, Convert);
55358 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55359 DCI.recursivelyDeleteUnusedNodes(LN);
55360 return SDValue(N, 0);
55361 }
55362 }
55363
55364 return SDValue();
55365}
55366
55367static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55368 TargetLowering::DAGCombinerInfo &DCI) {
55369 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55370 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55371 EVT VT = N->getValueType(0);
55372
55373 // Convert a full vector load into vzload when not all bits are needed.
55374 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55375 MVT InVT = In.getSimpleValueType();
55376 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55377 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55378 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55379 LoadSDNode *LN = cast<LoadSDNode>(In);
55380 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55381 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55382 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55383 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55384 SDLoc dl(N);
55385 if (IsStrict) {
55386 SDValue Convert =
55387 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55388 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55389 DCI.CombineTo(N, Convert, Convert.getValue(1));
55390 } else {
55391 SDValue Convert =
55392 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55393 DCI.CombineTo(N, Convert);
55394 }
55395 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55396 DCI.recursivelyDeleteUnusedNodes(LN);
55397 return SDValue(N, 0);
55398 }
55399 }
55400
55401 return SDValue();
55402}
55403
55404/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55405static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55406 TargetLowering::DAGCombinerInfo &DCI,
55407 const X86Subtarget &Subtarget) {
55408 SDValue N0 = N->getOperand(0);
55409 SDValue N1 = N->getOperand(1);
55410 MVT VT = N->getSimpleValueType(0);
55411 int NumElts = VT.getVectorNumElements();
55412 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55413 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55414 SDLoc DL(N);
55415
55416 // ANDNP(undef, x) -> 0
55417 // ANDNP(x, undef) -> 0
55418 if (N0.isUndef() || N1.isUndef())
55419 return DAG.getConstant(0, DL, VT);
55420
55421 // ANDNP(0, x) -> x
55422 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55423 return N1;
55424
55425 // ANDNP(x, 0) -> 0
55426 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55427 return DAG.getConstant(0, DL, VT);
55428
55429 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55430 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55431 return DAG.getNOT(DL, N0, VT);
55432
55433 // Turn ANDNP back to AND if input is inverted.
55434 if (SDValue Not = IsNOT(N0, DAG))
55435 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55436
55437 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask
55438 // to make use of predicated selects.
55439 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55440 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55441 SDValue Src = N0.getOperand(0);
55442 EVT SrcVT = Src.getValueType();
55443 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55444 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55445 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55446 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55447 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55448 getZeroVector(VT, Subtarget, DAG, DL));
55449 }
55450
55451 // Constant Folding
55452 APInt Undefs0, Undefs1;
55453 SmallVector<APInt> EltBits0, EltBits1;
55454 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55455 /*AllowWholeUndefs*/ true,
55456 /*AllowPartialUndefs*/ true)) {
55457 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55458 /*AllowWholeUndefs*/ true,
55459 /*AllowPartialUndefs*/ true)) {
55460 SmallVector<APInt> ResultBits;
55461 for (int I = 0; I != NumElts; ++I)
55462 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55463 return getConstVector(ResultBits, VT, DAG, DL);
55464 }
55465
55466 // Constant fold NOT(N0) to allow us to use AND.
55467 // Ensure this is only performed if we can confirm that the bitcasted source
55468 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55469 if (N0->hasOneUse()) {
55470 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55471 if (BC0.getOpcode() != ISD::BITCAST) {
55472 for (APInt &Elt : EltBits0)
55473 Elt = ~Elt;
55474 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55475 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55476 }
55477 }
55478 }
55479
55480 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55481 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55482 SDValue Op(N, 0);
55483 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55484 return Res;
55485
55486 // If either operand is a constant mask, then only the elements that aren't
55487 // zero are actually demanded by the other operand.
55488 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55489 APInt UndefElts;
55490 SmallVector<APInt> EltBits;
55491 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55492 APInt DemandedElts = APInt::getAllOnes(NumElts);
55493 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55494 EltBits)) {
55495 DemandedBits.clearAllBits();
55496 DemandedElts.clearAllBits();
55497 for (int I = 0; I != NumElts; ++I) {
55498 if (UndefElts[I]) {
55499 // We can't assume an undef src element gives an undef dst - the
55500 // other src might be zero.
55501 DemandedBits.setAllBits();
55502 DemandedElts.setBit(I);
55503 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55504 (!Invert && !EltBits[I].isZero())) {
55505 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55506 DemandedElts.setBit(I);
55507 }
55508 }
55509 }
55510 return std::make_pair(DemandedBits, DemandedElts);
55511 };
55512 APInt Bits0, Elts0;
55513 APInt Bits1, Elts1;
55514 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55515 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55516
55517 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55518 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55519 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55520 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55521 if (N->getOpcode() != ISD::DELETED_NODE)
55522 DCI.AddToWorklist(N);
55523 return SDValue(N, 0);
55524 }
55525 }
55526
55527 // Folds for better commutativity:
55528 if (N1->hasOneUse()) {
55529 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55530 if (SDValue Not = IsNOT(N1, DAG))
55531 return DAG.getNOT(
55532 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55533
55534 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55535 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55536 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55537 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55538 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55539 EVT ShufVT = BC1.getValueType();
55540 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55541 DAG.getBitcast(ShufVT, N0));
55542 SDValue NewShuf =
55543 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55544 return DAG.getBitcast(VT, NewShuf);
55545 }
55546 }
55547 }
55548
55549 return SDValue();
55550}
55551
55552static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55553 TargetLowering::DAGCombinerInfo &DCI) {
55554 SDValue N1 = N->getOperand(1);
55555
55556 // BT ignores high bits in the bit index operand.
55557 unsigned BitWidth = N1.getValueSizeInBits();
55558 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55559 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55560 if (N->getOpcode() != ISD::DELETED_NODE)
55561 DCI.AddToWorklist(N);
55562 return SDValue(N, 0);
55563 }
55564
55565 return SDValue();
55566}
55567
55568static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55569 TargetLowering::DAGCombinerInfo &DCI) {
55570 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55571 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55572
55573 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55575 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55576 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55577 if (N->getOpcode() != ISD::DELETED_NODE)
55578 DCI.AddToWorklist(N);
55579 return SDValue(N, 0);
55580 }
55581
55582 // Convert a full vector load into vzload when not all bits are needed.
55583 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55584 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55585 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55586 SDLoc dl(N);
55587 if (IsStrict) {
55588 SDValue Convert = DAG.getNode(
55589 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55590 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55591 DCI.CombineTo(N, Convert, Convert.getValue(1));
55592 } else {
55593 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55594 DAG.getBitcast(MVT::v8i16, VZLoad));
55595 DCI.CombineTo(N, Convert);
55596 }
55597
55598 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55599 DCI.recursivelyDeleteUnusedNodes(LN);
55600 return SDValue(N, 0);
55601 }
55602 }
55603 }
55604
55605 return SDValue();
55606}
55607
55608// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55609static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55610 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55611
55612 EVT DstVT = N->getValueType(0);
55613
55614 SDValue N0 = N->getOperand(0);
55615 SDValue N1 = N->getOperand(1);
55616 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55617
55618 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55619 return SDValue();
55620
55621 // Look through single use any_extends / truncs.
55622 SDValue IntermediateBitwidthOp;
55623 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55624 N0.hasOneUse()) {
55625 IntermediateBitwidthOp = N0;
55626 N0 = N0.getOperand(0);
55627 }
55628
55629 // See if we have a single use cmov.
55630 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55631 return SDValue();
55632
55633 SDValue CMovOp0 = N0.getOperand(0);
55634 SDValue CMovOp1 = N0.getOperand(1);
55635
55636 // Make sure both operands are constants.
55637 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55638 !isa<ConstantSDNode>(CMovOp1.getNode()))
55639 return SDValue();
55640
55641 SDLoc DL(N);
55642
55643 // If we looked through an any_extend/trunc above, add one to the constants.
55644 if (IntermediateBitwidthOp) {
55645 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55646 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55647 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55648 }
55649
55650 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55651 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55652
55653 EVT CMovVT = DstVT;
55654 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55655 if (DstVT == MVT::i16) {
55656 CMovVT = MVT::i32;
55657 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55658 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55659 }
55660
55661 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55662 N0.getOperand(2), N0.getOperand(3));
55663
55664 if (CMovVT != DstVT)
55665 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55666
55667 return CMov;
55668}
55669
55670static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55671 const X86Subtarget &Subtarget) {
55672 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55673
55674 if (SDValue V = combineSextInRegCmov(N, DAG))
55675 return V;
55676
55677 EVT VT = N->getValueType(0);
55678 SDValue N0 = N->getOperand(0);
55679 SDValue N1 = N->getOperand(1);
55680 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55681 SDLoc dl(N);
55682
55683 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
55684 // SSE and AVX2 since there is no sign-extended shift right
55685 // operation on a vector with 64-bit elements.
55686 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55687 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55688 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55689 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55690 SDValue N00 = N0.getOperand(0);
55691
55692 // EXTLOAD has a better solution on AVX2,
55693 // it may be replaced with X86ISD::VSEXT node.
55694 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55695 if (!ISD::isNormalLoad(N00.getNode()))
55696 return SDValue();
55697
55698 // Attempt to promote any comparison mask ops before moving the
55699 // SIGN_EXTEND_INREG in the way.
55700 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55701 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55702
55703 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55704 SDValue Tmp =
55705 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55706 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55707 }
55708 }
55709 return SDValue();
55710}
55711
55712/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55713/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55714/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55715/// opportunities to combine math ops, use an LEA, or use a complex addressing
55716/// mode. This can eliminate extend, add, and shift instructions.
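/// For example (illustrative): (i64 sext (i32 add nsw X, 40)) becomes
/// (i64 add (i64 sext X), 40), where the extended constant can later fold
/// into the displacement of an LEA.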
55717static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55718 const X86Subtarget &Subtarget) {
55719 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55720 Ext->getOpcode() != ISD::ZERO_EXTEND)
55721 return SDValue();
55722
55723 // TODO: This should be valid for other integer types.
55724 EVT VT = Ext->getValueType(0);
55725 if (VT != MVT::i64)
55726 return SDValue();
55727
55728 SDValue Add = Ext->getOperand(0);
55729 if (Add.getOpcode() != ISD::ADD)
55730 return SDValue();
55731
55732 SDValue AddOp0 = Add.getOperand(0);
55733 SDValue AddOp1 = Add.getOperand(1);
55734 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55735 bool NSW = Add->getFlags().hasNoSignedWrap();
55736 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55737 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55738 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55739
55740 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55741 // into the 'zext'
55742 if ((Sext && !NSW) || (!Sext && !NUW))
55743 return SDValue();
55744
55745 // Having a constant operand to the 'add' ensures that we are not increasing
55746 // the instruction count because the constant is extended for free below.
55747 // A constant operand can also become the displacement field of an LEA.
55748 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55749 if (!AddOp1C)
55750 return SDValue();
55751
55752 // Don't make the 'add' bigger if there's no hope of combining it with some
55753 // other 'add' or 'shl' instruction.
55754 // TODO: It may be profitable to generate simpler LEA instructions in place
55755 // of single 'add' instructions, but the cost model for selecting an LEA
55756 // currently has a high threshold.
55757 bool HasLEAPotential = false;
55758 for (auto *User : Ext->users()) {
55759 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55760 HasLEAPotential = true;
55761 break;
55762 }
55763 }
55764 if (!HasLEAPotential)
55765 return SDValue();
55766
55767 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55768 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55769 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55770 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55771
55772 // The wider add is guaranteed to not wrap because both operands are
55773 // sign-extended.
55774 SDNodeFlags Flags;
55775 Flags.setNoSignedWrap(NSW);
55776 Flags.setNoUnsignedWrap(NUW);
55777 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55778}
55779
55780// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55781// operands and the result of CMOV is not used anywhere else - promote CMOV
55782// itself instead of promoting its result. This could be beneficial, because:
55783// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55784// (or more) pseudo-CMOVs only when they go one-after-another and
55785// getting rid of result extension code after CMOV will help that.
55786// 2) Promotion of constant CMOV arguments is free, hence the
55787// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55788// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
55789// promotion is also good in terms of code-size.
55790// (64-bit CMOV is 4 bytes, that's why we don't do 32-bit => 64-bit
55791// promotion).
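// For example (illustrative): (i32 zext (i16 cmov C1, C2)) is rewritten as
// (i32 cmov (zext C1), (zext C2)), and the constant extensions fold away.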
55792static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55793 SDValue CMovN = Extend->getOperand(0);
55794 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55795 return SDValue();
55796
55797 EVT TargetVT = Extend->getValueType(0);
55798 unsigned ExtendOpcode = Extend->getOpcode();
55799 SDLoc DL(Extend);
55800
55801 EVT VT = CMovN.getValueType();
55802 SDValue CMovOp0 = CMovN.getOperand(0);
55803 SDValue CMovOp1 = CMovN.getOperand(1);
55804
55805 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55806 !isa<ConstantSDNode>(CMovOp1.getNode()))
55807 return SDValue();
55808
55809 // Only extend to i32 or i64.
55810 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55811 return SDValue();
55812
55813 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
55814 // are free.
55815 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55816 return SDValue();
55817
55818 // If this is a zero extend to i64, we should only extend to i32 and use a free
55819 // zero extend to finish.
55820 EVT ExtendVT = TargetVT;
55821 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55822 ExtendVT = MVT::i32;
55823
55824 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55825 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55826
55827 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55828 CMovN.getOperand(2), CMovN.getOperand(3));
55829
55830 // Finish extending if needed.
55831 if (ExtendVT != TargetVT)
55832 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55833
55834 return Res;
55835}
55836
55837// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55838// result type.
55839static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55840 const X86Subtarget &Subtarget) {
55841 SDValue N0 = N->getOperand(0);
55842 EVT VT = N->getValueType(0);
55843 SDLoc dl(N);
55844
55845 // Only do this combine with AVX512 for vector extends.
55846 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55847 return SDValue();
55848
55849 // Only combine legal element types.
55850 EVT SVT = VT.getVectorElementType();
55851 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55852 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55853 return SDValue();
55854
55855 // We don't have a CMPP instruction for vXf16.
55856 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55857 return SDValue();
55858 // We can only do this if the vector size is 256 bits or less.
55859 unsigned Size = VT.getSizeInBits();
55860 if (Size > 256 && Subtarget.useAVX512Regs())
55861 return SDValue();
55862
55863 EVT N00VT = N0.getOperand(0).getValueType();
55864
55865 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55866 // those are the only integer compares we have.
55867 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55868 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55869 return SDValue();
55870
55871 // Only do this combine if the extension will be fully consumed by the setcc.
55872 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55873 if (Size != MatchingVecType.getSizeInBits())
55874 return SDValue();
55875
55876 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55877
55878 if (N->getOpcode() == ISD::ZERO_EXTEND)
55879 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55880
55881 return Res;
55882}
55883
55884static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55885 TargetLowering::DAGCombinerInfo &DCI,
55886 const X86Subtarget &Subtarget) {
55887 SDValue N0 = N->getOperand(0);
55888 EVT VT = N->getValueType(0);
55889 SDLoc DL(N);
55890
55891 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55892 if (!DCI.isBeforeLegalizeOps() &&
55893 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55894 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55895 N0->getOperand(1));
55896 bool ReplaceOtherUses = !N0.hasOneUse();
55897 DCI.CombineTo(N, Setcc);
55898 // Replace other uses with a truncate of the widened setcc_carry.
55899 if (ReplaceOtherUses) {
55900 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55901 N0.getValueType(), Setcc);
55902 DCI.CombineTo(N0.getNode(), Trunc);
55903 }
55904
55905 return SDValue(N, 0);
55906 }
55907
55908 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55909 return NewCMov;
55910
55911 if (!DCI.isBeforeLegalizeOps())
55912 return SDValue();
55913
55914 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55915 return V;
55916
55917 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55918 DAG, DCI, Subtarget))
55919 return V;
55920
55921 if (VT.isVector()) {
55922 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55923 return R;
55924
55925 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55926 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55927 }
55928
55929 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55930 return NewAdd;
55931
55932 return SDValue();
55933}
55934
55935// Inverting a constant vector is profitable if it can be eliminated and the
55936// inverted vector is already present in DAG. Otherwise, it will be loaded
55937// anyway.
55938//
55939// We determine which of the values can be completely eliminated and invert it.
55940// If both are eliminable, select a vector with the first negative element.
55943 "ConstantFP build vector expected");
55944 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
55945 // can eliminate it, since this function is invoked for each FMA with this
55946 // vector.
55947 auto IsNotFMA = [](SDNode *User) {
55948 return User->getOpcode() != ISD::FMA &&
55949 User->getOpcode() != ISD::STRICT_FMA;
55950 };
55951 if (llvm::any_of(V->users(), IsNotFMA))
55952 return SDValue();
55953
55954 SmallVector<SDValue, 8> Ops;
55955 EVT VT = V.getValueType();
55956 EVT EltVT = VT.getVectorElementType();
55957 for (const SDValue &Op : V->op_values()) {
55958 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55959 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
55960 } else {
55961 assert(Op.isUndef());
55962 Ops.push_back(DAG.getUNDEF(EltVT));
55963 }
55964 }
55965
55966 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
55967 if (!NV)
55968 return SDValue();
55969
55970 // If an inverted version cannot be eliminated, choose it instead of the
55971 // original version.
55972 if (llvm::any_of(NV->users(), IsNotFMA))
55973 return SDValue(NV, 0);
55974
55975 // If the inverted version also can be eliminated, we have to consistently
55976 // prefer one of the values. We prefer a constant with a negative value on
55977 // the first place.
55978 // N.B. We need to skip undefs that may precede a value.
55979 for (const SDValue &Op : V->op_values()) {
55980 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
55981 if (Cst->isNegative())
55982 return SDValue();
55983 break;
55984 }
55985 }
55986 return SDValue(NV, 0);
55987}
55988
55989static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
55990 TargetLowering::DAGCombinerInfo &DCI,
55991 const X86Subtarget &Subtarget) {
55992 SDLoc dl(N);
55993 EVT VT = N->getValueType(0);
55994 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55995 bool IsStrict = N->isTargetOpcode()
55996 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
55997 : N->isStrictFPOpcode();
55998
55999 // Let legalize expand this if it isn't a legal type yet.
56000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56001 if (!TLI.isTypeLegal(VT))
56002 return SDValue();
56003
56004 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56005 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56006 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56007
56008 // If the operation allows fast-math and the target does not support FMA,
56009 // split this into mul+add to avoid libcall(s).
56010 SDNodeFlags Flags = N->getFlags();
56011 if (!IsStrict && Flags.hasAllowReassociation() &&
56012 TLI.isOperationExpand(ISD::FMA, VT)) {
56013 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56014 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56015 }
56016
56017 EVT ScalarVT = VT.getScalarType();
56018 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56019 !Subtarget.hasAnyFMA()) &&
56020 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56021 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56022 return SDValue();
56023
56024 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56025 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56026 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56027 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56028 CodeSize)) {
56029 V = NegV;
56030 return true;
56031 }
56032 // Look through extract_vector_elts. If it comes from an FNEG, create a
56033 // new extract from the FNEG input.
56034 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56035 isNullConstant(V.getOperand(1))) {
56036 SDValue Vec = V.getOperand(0);
56037 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56038 Vec, DAG, LegalOperations, CodeSize)) {
56039 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56040 NegV, V.getOperand(1));
56041 return true;
56042 }
56043 }
56044 // Lookup if there is an inverted version of constant vector V in DAG.
56045 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56046 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56047 V = NegV;
56048 return true;
56049 }
56050 }
56051 return false;
56052 };
56053
56054 // Do not convert the passthru input of scalar intrinsics.
56055 // FIXME: We could allow negations of the lower element only.
56056 bool NegA = invertIfNegative(A);
56057 // Create a dummy use for A so that in the process of negating B or C
56058 // recursively, it is not deleted.
56059 HandleSDNode NegAHandle(A);
56060 bool NegB = invertIfNegative(B);
56061 // Similar to A, get a handle on B.
56062 HandleSDNode NegBHandle(B);
56063 bool NegC = invertIfNegative(C);
56064
56065 if (!NegA && !NegB && !NegC)
56066 return SDValue();
56067
56068 unsigned NewOpcode =
56069 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56070
56071 // Propagate fast-math-flags to new FMA node.
56072 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56073 if (IsStrict) {
56074 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56075 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56076 {N->getOperand(0), A, B, C});
56077 } else {
56078 if (N->getNumOperands() == 4)
56079 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56080 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56081 }
56082}
56083
56084// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56085// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56086static SDValue combineFMADDSUB_FMSUBADD(SDNode *N, SelectionDAG &DAG,
56087 TargetLowering::DAGCombinerInfo &DCI) {
56088 SDLoc dl(N);
56089 EVT VT = N->getValueType(0);
56090 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56091 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56092 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56093
56094 SDValue N2 = N->getOperand(2);
56095
56096 SDValue NegN2 =
56097 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56098 if (!NegN2)
56099 return SDValue();
56100 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56101
56102 if (N->getNumOperands() == 4)
56103 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56104 NegN2, N->getOperand(3));
56105 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56106 NegN2);
56107}
56108
56109// Try to widen the build vector and bitcast it to the type of the zext.
56110// This is a special case for the 128-bit vector types. The intention is to remove
56111// the zext and replace it with a bitcast to the wider type. While lowering,
56112// the bitcast is removed and the extra computation due to the zext is avoided.
56113// For example:
56114// zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
56115// build_vector (x, 0, y, 0, z, 0, w, 0))
56116static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56117
56118 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56119 return SDValue();
56120
56121 EVT ExtendVT = Extend->getValueType(0);
56122
56123 SDValue BV = Extend->getOperand(0);
56124 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56125 return SDValue();
56126
56127 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56128 // If the build vector has undef elements, we cannot widen it.
56129 // The widening would create a vector with more undef elements, which
56130 // is not valid.
56131 return SDValue();
56132 }
56133
56134 if (!all_of(BV->op_values(),
56135 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56136 // If the build vector has any element other than ISD::LOAD, we cannot
56137 // widen it.
56138 return SDValue();
56139 }
56140
56141 SDLoc dl(BV);
56142 EVT VT = BV.getValueType();
56143 EVT EltVT = BV.getOperand(0).getValueType();
56144 unsigned NumElts = VT.getVectorNumElements();
56145
56146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56147
56148 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56150 return SDValue();
56151
56152 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56153 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56154
56155 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56156 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56157 // Fill the new elements with Zero.
56158 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56159 // Compute the step to place the elements in the right place and control the
56160 // iteration.
56161 unsigned step = WidenNumElts / NumElts;
56162 if (WidenVT.is128BitVector()) {
56163 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56164 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56165 i--, j -= step) {
56166 SDValue temp = NewOps[i];
56167 NewOps[i] = NewOps[j];
56168 NewOps[j] = temp;
56169 }
56170 // Create new build vector with WidenVT and NewOps
56171 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56172 // Replace the old build vector with the new one. Bitcast the
56173 // new build vector to the type of the zext.
56174 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56175 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56176 return NewBV;
56177 }
56178 }
56179 return SDValue();
56180}
56181
56182static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56183 TargetLowering::DAGCombinerInfo &DCI,
56184 const X86Subtarget &Subtarget) {
56185 SDLoc dl(N);
56186 SDValue N0 = N->getOperand(0);
56187 EVT VT = N->getValueType(0);
56188
56189 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56190 // FIXME: Is this needed? We don't seem to have any tests for it.
56191 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56192 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56193 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56194 N0->getOperand(1));
56195 bool ReplaceOtherUses = !N0.hasOneUse();
56196 DCI.CombineTo(N, Setcc);
56197 // Replace other uses with a truncate of the widened setcc_carry.
56198 if (ReplaceOtherUses) {
56199 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56200 N0.getValueType(), Setcc);
56201 DCI.CombineTo(N0.getNode(), Trunc);
56202 }
56203
56204 return SDValue(N, 0);
56205 }
56206
56207 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56208 return NewCMov;
56209
56210 if (DCI.isBeforeLegalizeOps())
56211 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56212 return V;
56213
56214 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56215 DAG, DCI, Subtarget))
56216 return V;
56217
56218 if (VT.isVector())
56219 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56220 return R;
56221
56222 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56223 return NewAdd;
56224
56225 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56226 return R;
56227
56228 // TODO: Combine with any target/faux shuffle.
56229 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56231 SDValue N00 = N0.getOperand(0);
56232 SDValue N01 = N0.getOperand(1);
56233 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56234 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56235 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56236 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56237 return concatSubVectors(N00, N01, DAG, dl);
56238 }
56239 }
56240
56241 if (SDValue V = widenBuildVec(N, DAG))
56242 return V;
56243
56244 return SDValue();
56245}
56246
56247/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56248/// pre-promote its result type since vXi1 vectors don't get promoted
56249/// during type legalization.
56250static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
56251 SDValue RHS, ISD::CondCode CC,
56252 const SDLoc &DL, SelectionDAG &DAG,
56253 const X86Subtarget &Subtarget) {
56254 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56255 VT.getVectorElementType() == MVT::i1 &&
56256 (OpVT.getVectorElementType() == MVT::i8 ||
56257 OpVT.getVectorElementType() == MVT::i16)) {
56258 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56259 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56260 }
56261 return SDValue();
56262}
56263
56264// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56265// eq/ne) is generated when using an integer as a mask. Instead of generating a
56266// broadcast + vptest, we can directly move the integer to a mask register.
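// For example (illustrative), with a v4i32 constant (1, 2, 4, 8):
//   (v4i1 setcc (and (broadcast X), (1, 2, 4, 8)), 0, ne)
// becomes a move of the low 4 bits of X into a mask register.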
56267static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56268 const SDLoc &DL, SelectionDAG &DAG,
56269 const X86Subtarget &Subtarget) {
56270 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56271 return SDValue();
56272
56273 if (!Subtarget.hasAVX512())
56274 return SDValue();
56275
56276 if (Op0.getOpcode() != ISD::AND)
56277 return SDValue();
56278
56279 SDValue Broadcast = Op0.getOperand(0);
56280 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56281 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56282 return SDValue();
56283
56284 SDValue Load = Op0.getOperand(1);
56285 EVT LoadVT = Load.getSimpleValueType();
56286
56287 APInt UndefElts;
56288 SmallVector<APInt, 32> EltBits;
56289 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56290 UndefElts, EltBits,
56291 /*AllowWholeUndefs*/ true,
56292 /*AllowPartialUndefs*/ false) ||
56293 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56294 return SDValue();
56295
56296 // Check if the constant pool contains only powers of 2 starting from some
56297 // 2^N. The table may also contain undefs because of widening of vector
56298 // operands.
56299 unsigned N = EltBits[0].logBase2();
56300 unsigned Len = UndefElts.getBitWidth();
56301 for (unsigned I = 1; I != Len; ++I) {
56302 if (UndefElts[I]) {
56303 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56304 return SDValue();
56305 break;
56306 }
56307
56308 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56309 return SDValue();
56310 }
56311
56312 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56313 SDValue BroadcastOp;
56314 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56315 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56316 Broadcast, DAG.getVectorIdxConstant(0, DL));
56317 } else {
56318 BroadcastOp = Broadcast.getOperand(0);
56319 if (BroadcastOp.getValueType().isVector())
56320 return SDValue();
56321 }
56322
56323 SDValue Masked = BroadcastOp;
56324 if (N != 0) {
56325 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56326 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56327
56328 if (NumDefinedElts > BroadcastOpBitWidth)
56329 return SDValue();
56330
56331 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56332 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56333 DAG.getConstant(N, DL, BroadcastOpVT));
56334 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56335 DAG.getConstant(Mask, DL, BroadcastOpVT));
56336 }
56337 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56338 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56339 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56340 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56341
56342 if (CC == ISD::SETEQ)
56343 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56344
56345 if (VT != MVT::v16i1)
56346 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56347 DAG.getVectorIdxConstant(0, DL));
56348
56349 return Bitcast;
56350}
56351
56352static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56353 TargetLowering::DAGCombinerInfo &DCI,
56354 const X86Subtarget &Subtarget) {
56355 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56356 const SDValue LHS = N->getOperand(0);
56357 const SDValue RHS = N->getOperand(1);
56358 EVT VT = N->getValueType(0);
56359 EVT OpVT = LHS.getValueType();
56360 SDLoc DL(N);
56361
56362 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56363 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56364 Subtarget))
56365 return V;
56366 }
56367
56368 if (VT == MVT::i1) {
56369 X86::CondCode X86CC;
56370 if (SDValue V =
56371 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56372 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56373 }
56374
56375 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56376 if (OpVT.isScalarInteger()) {
56377 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56378 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56379 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56380 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56381 if (N0.getOperand(0) == N1)
56382 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56383 N0.getOperand(1));
56384 if (N0.getOperand(1) == N1)
56385 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56386 N0.getOperand(0));
56387 }
56388 return SDValue();
56389 };
56390 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56391 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56392 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56393 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56394
56395 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56396 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56397 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56398 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56399 if (N0.getOperand(0) == N1)
56400 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56401 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56402 if (N0.getOperand(1) == N1)
56403 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56404 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56405 }
56406 return SDValue();
56407 };
56408 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56409 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56410 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56411 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56412
56413 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56414 // cmpne(trunc(x),C) --> cmpne(x,C)
56415 // iff x upper bits are zero.
56416 if (LHS.getOpcode() == ISD::TRUNCATE &&
56417 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56419 EVT SrcVT = LHS.getOperand(0).getValueType();
56420 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56421 OpVT.getScalarSizeInBits());
56422 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56423 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56424 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56425 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56426 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56427 }
56428
56429 // With C as a power of 2 and C != 0 and C != INT_MIN:
56430 // icmp eq Abs(X) C ->
56431 // (icmp eq A, C) | (icmp eq A, -C)
56432 // icmp ne Abs(X) C ->
56433 // (icmp ne A, C) & (icmp ne A, -C)
56434 // Both of these patterns can be better optimized in
56435 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56436 // integers which is checked above.
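// For example (illustrative), with C = 8:
//   (icmp eq (abs X), 8) -> (or (icmp eq X, 8), (icmp eq X, -8))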
56437 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56438 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56439 const APInt &CInt = C->getAPIntValue();
56440 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56441 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56442 SDValue BaseOp = LHS.getOperand(0);
56443 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56444 SDValue SETCC1 = DAG.getSetCC(
56445 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56446 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56447 SETCC0, SETCC1);
56448 }
56449 }
56450 }
56451 }
56452 }
56453
56454 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56455 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56456 // Using temporaries to avoid messing up operand ordering for later
56457 // transformations if this doesn't work.
56458 SDValue Op0 = LHS;
56459 SDValue Op1 = RHS;
56460 ISD::CondCode TmpCC = CC;
56461 // Put build_vector on the right.
56462 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56463 std::swap(Op0, Op1);
56464 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56465 }
56466
56467 bool IsSEXT0 =
56468 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56469 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56470 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56471
56472 if (IsSEXT0 && IsVZero1) {
56473 assert(VT == Op0.getOperand(0).getValueType() &&
56474 "Unexpected operand type");
56475 if (TmpCC == ISD::SETGT)
56476 return DAG.getConstant(0, DL, VT);
56477 if (TmpCC == ISD::SETLE)
56478 return DAG.getConstant(1, DL, VT);
56479 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56480 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56481
56482 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56483 "Unexpected condition code!");
56484 return Op0.getOperand(0);
56485 }
56486
56487 if (IsVZero1)
56488 if (SDValue V =
56489 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56490 return V;
56491 }
56492
56493 // Try and make unsigned vector comparisons signed. On pre-AVX512 targets there
56494 // are only signed comparisons (`PCMPGT`), and on AVX512 it's often better to
56495 // use `PCMPGT` if the result is meant to stay in a vector (and if it's going
56496 // to a mask, there are unsigned AVX512 comparisons).
56497 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56498 bool CanMakeSigned = false;
56499 if (ISD::isUnsignedIntSetCC(CC)) {
56500 KnownBits CmpKnown =
56501 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56502 // If we know LHS/RHS share the same sign bit at each element we can
56503 // make this signed.
56504 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56505 // across all lanes. So a pattern where the sign varies from lane to
56506 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56507 // missed. We could get around this by demanding each lane
56508 // independently, but this isn't the most important optimization and
56509 // that may eat into compile time.
56510 CanMakeSigned =
56511 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56512 }
56513 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56514 SDValue LHSOut = LHS;
56515 SDValue RHSOut = RHS;
56516 ISD::CondCode NewCC = CC;
56517 switch (CC) {
56518 case ISD::SETGE:
56519 case ISD::SETUGE:
56520 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56521 /*NSW*/ true))
56522 LHSOut = NewLHS;
56523 else if (SDValue NewRHS = incDecVectorConstant(
56524 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56525 RHSOut = NewRHS;
56526 else
56527 break;
56528
56529 [[fallthrough]];
56530 case ISD::SETUGT:
56531 NewCC = ISD::SETGT;
56532 break;
56533
56534 case ISD::SETLE:
56535 case ISD::SETULE:
56536 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56537 /*NSW*/ true))
56538 LHSOut = NewLHS;
56539 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56540 /*NSW*/ true))
56541 RHSOut = NewRHS;
56542 else
56543 break;
56544
56545 [[fallthrough]];
56546 case ISD::SETULT:
56547 // Will be swapped to SETGT in LowerVSETCC*.
56548 NewCC = ISD::SETLT;
56549 break;
56550 default:
56551 break;
56552 }
56553 if (NewCC != CC) {
56554 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56555 NewCC, DL, DAG, Subtarget))
56556 return R;
56557 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56558 }
56559 }
56560 }
56561
56562 if (SDValue R =
56563 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56564 return R;
56565
56566 // In the middle end transforms:
56567 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56568 // -> `(icmp ult (add x, -C), 2)`
56569 // Likewise inverted cases with `ugt`.
56570 //
56571 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56572 // in worse codegen. So, undo the middle-end transform and go back to `(or
56573 // (icmp eq), (icmp eq))` form.
56574 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56575 // the xmm approach.
56576 //
56577 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56578 // ne))` as it doesn't end up saving any instructions.
56579 // TODO: We might want to do this for avx512 as well if we `sext` the result.
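// For example (illustrative): (setcc ult (add X, -5), 2) is turned back into
// (or (setcc eq X, 5), (setcc eq X, 6)).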
56580 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56581 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56582 !Subtarget.hasAVX512() &&
56583 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56584 Subtarget.hasAVX2()) &&
56585 LHS.hasOneUse()) {
56586
56587 APInt CmpC;
56588 SDValue AddC = LHS.getOperand(1);
56589 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56591 // See which form we have depending on the constant/condition.
56592 SDValue C0 = SDValue();
56593 SDValue C1 = SDValue();
56594
56595 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56596 // we will end up generating an additional constant. Keeping in the
56597 // current form has a slight latency cost, but it is probably worth saving a
56598 // constant.
56601 // Pass
56602 }
56603 // Normal Cases
56604 else if ((CC == ISD::SETULT && CmpC == 2) ||
56605 (CC == ISD::SETULE && CmpC == 1)) {
56606 // These will constant fold.
56607 C0 = DAG.getNegative(AddC, DL, OpVT);
56608 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56609 DAG.getAllOnesConstant(DL, OpVT));
56610 }
56611 // Inverted Cases
56612 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56613 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56614 // These will constant fold.
56615 C0 = DAG.getNOT(DL, AddC, OpVT);
56616 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56617 DAG.getAllOnesConstant(DL, OpVT));
56618 }
56619 if (C0 && C1) {
56620 SDValue NewLHS =
56621 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56622 SDValue NewRHS =
56623 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56624 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56625 }
56626 }
56627 }
56628
56629 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56630 // to avoid scalarization via legalization because v4i32 is not a legal type.
56631 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56632 LHS.getValueType() == MVT::v4f32)
56633 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56634
56635 // X pred 0.0 --> X pred -X
56636 // If the negation of X already exists, use it in the comparison. This removes
56637 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56638 // instructions in patterns with a 'select' node.
56639 if (isNullFPScalarOrVectorConst(RHS)) {
56640 SDVTList FNegVT = DAG.getVTList(OpVT);
56641 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56642 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56643 }
56644
56645 return SDValue();
56646}
56647
56648static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
56649 TargetLowering::DAGCombinerInfo &DCI,
56650 const X86Subtarget &Subtarget) {
56651 SDValue Src = N->getOperand(0);
56652 MVT SrcVT = Src.getSimpleValueType();
56653 MVT VT = N->getSimpleValueType(0);
56654 unsigned NumBits = VT.getScalarSizeInBits();
56655 unsigned NumElts = SrcVT.getVectorNumElements();
56656 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56657 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56658
56659 // Perform constant folding.
56660 APInt UndefElts;
56661 SmallVector<APInt, 32> EltBits;
56662 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56663 /*AllowWholeUndefs*/ true,
56664 /*AllowPartialUndefs*/ true)) {
56665 APInt Imm(32, 0);
56666 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56667 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56668 Imm.setBit(Idx);
56669
56670 return DAG.getConstant(Imm, SDLoc(N), VT);
56671 }
56672
56673 // Look through int->fp bitcasts that don't change the element width.
56674 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56675 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56676 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56677 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56678
56679 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56680 // with scalar comparisons.
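// For example (illustrative, v4i32): MOVMSK packs the 4 sign bits into bits
// [3:0] of the i32 result, and NOT flips every sign bit, so
//   movmsk(not(x)) == movmsk(x) ^ 0b1111
// where 0b1111 is exactly the NotMask (low NumElts bits) built below.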
56681 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56682 SDLoc DL(N);
56683 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56684 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56685 return DAG.getNode(ISD::XOR, DL, VT,
56686 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56687 DAG.getConstant(NotMask, DL, VT));
56688 }
56689
56690 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56691 // results with scalar comparisons.
56692 if (Src.getOpcode() == X86ISD::PCMPGT &&
56693 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56694 SDLoc DL(N);
56695 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56696 return DAG.getNode(ISD::XOR, DL, VT,
56697 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56698 DAG.getConstant(NotMask, DL, VT));
56699 }
56700
56701 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56702 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56703 // iff pow2splat(c1).
56704 // Use KnownBits to determine if only a single bit is non-zero
56705 // in each element (pow2 or zero), and shift that bit to the msb.
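// For example (an illustrative sketch, v4i32 with c1 = splat(8), i.e. only
// bit 3 set): ShiftAmt is 28, so bit 3 of each element is shifted into the
// sign bit and
//   movmsk(pcmpeq(and(x,8), 8)) -> movmsk(not(xor(shl(and(x,8),28), shl(8,28))))
//   movmsk(pcmpeq(and(x,8), 0)) -> movmsk(not(shl(and(x,8),28)))
// which read the same per-element test out of the sign bits.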
56706 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56707 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56708 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56709 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56710 if (KnownLHS.countMaxPopulation() == 1 &&
56711 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56712 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56713 SDLoc DL(N);
56714 MVT ShiftVT = SrcVT;
56715 SDValue ShiftLHS = Src.getOperand(0);
56716 SDValue ShiftRHS = Src.getOperand(1);
56717 if (ShiftVT.getScalarType() == MVT::i8) {
56718 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
56719 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56720 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56721 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56722 }
56723 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56724 ShiftLHS, ShiftAmt, DAG);
56725 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56726 ShiftRHS, ShiftAmt, DAG);
56727 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56728 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56729 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56730 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56731 }
56732 }
56733
56734 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
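// For example (illustrative, v4i32): with C = <0x80000000, 0, 0x80000000, 0>
// only the sign bits of elements 0 and 2 are set, so Mask == 0b0101 and
//   movmsk(or(X, C)) -> or(movmsk(X), 0b0101)
// The same per-sign-bit reasoning applies to AND and XOR.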
56735 if (N->isOnlyUserOf(Src.getNode())) {
56737 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56738 APInt UndefElts;
56739 SmallVector<APInt, 32> EltBits;
56740 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56741 UndefElts, EltBits)) {
56742 APInt Mask = APInt::getZero(NumBits);
56743 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56744 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56745 Mask.setBit(Idx);
56746 }
56747 SDLoc DL(N);
56748 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56749 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56750 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56751 DAG.getConstant(Mask, DL, VT));
56752 }
56753 }
56754 }
56755
56756 // Simplify the inputs.
56757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56758 APInt DemandedMask(APInt::getAllOnes(NumBits));
56759 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56760 return SDValue(N, 0);
56761
56762 return SDValue();
56763}
56764
56767 const X86Subtarget &Subtarget) {
56768 MVT VT = N->getSimpleValueType(0);
56769 unsigned NumBits = VT.getScalarSizeInBits();
56770
56771 // Simplify the inputs.
56772 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56773 APInt DemandedMask(APInt::getAllOnes(NumBits));
56774 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56775 return SDValue(N, 0);
56776
56777 return SDValue();
56778}
56779
56783 SDValue Mask = MemOp->getMask();
56784
56785 // With vector masks we only demand the upper bit of the mask.
56786 if (Mask.getScalarValueSizeInBits() != 1) {
56787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56788 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56789 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56790 if (N->getOpcode() != ISD::DELETED_NODE)
56791 DCI.AddToWorklist(N);
56792 return SDValue(N, 0);
56793 }
56794 }
56795
56796 return SDValue();
56797}
56798
56800 SDValue Index, SDValue Base, SDValue Scale,
56801 SelectionDAG &DAG) {
56802 SDLoc DL(GorS);
56803
56804 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56805 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56806 Gather->getMask(), Base, Index, Scale } ;
56807 return DAG.getMaskedGather(Gather->getVTList(),
56808 Gather->getMemoryVT(), DL, Ops,
56809 Gather->getMemOperand(),
56810 Gather->getIndexType(),
56811 Gather->getExtensionType());
56812 }
56813 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56814 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56815 Scatter->getMask(), Base, Index, Scale };
56816 return DAG.getMaskedScatter(Scatter->getVTList(),
56817 Scatter->getMemoryVT(), DL,
56818 Ops, Scatter->getMemOperand(),
56819 Scatter->getIndexType(),
56820 Scatter->isTruncatingStore());
56821}
56822
56825 SDLoc DL(N);
56826 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56827 SDValue Index = GorS->getIndex();
56828 SDValue Base = GorS->getBasePtr();
56829 SDValue Scale = GorS->getScale();
56830 EVT IndexVT = Index.getValueType();
56831 EVT IndexSVT = IndexVT.getVectorElementType();
56832 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56833 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56834 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56835
56836 if (DCI.isBeforeLegalize()) {
56837 // Attempt to move a shifted index into the address scale, which allows
56838 // further index truncation below.
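// For example (illustrative): a gather with Index = shl(Y, 2) and Scale = 2
// can be rebuilt as Index = shl(Y, 1) with Scale = 4; the smaller shift
// preserves more known sign bits in the index, which helps the index
// shrinking below.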
56839 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56840 isa<ConstantSDNode>(Scale)) {
56841 unsigned ScaleAmt = Scale->getAsZExtVal();
56842 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56843 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56844 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56845 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56846 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56847 if (N->getOpcode() != ISD::DELETED_NODE)
56848 DCI.AddToWorklist(N);
56849 return SDValue(N, 0);
56850 }
56851 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56852 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56853 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56854 SDValue ShAmt = Index.getOperand(1);
56855 SDValue NewShAmt =
56856 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56857 DAG.getConstant(1, DL, ShAmt.getValueType()));
56858 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56859 Index.getOperand(0), NewShAmt);
56860 SDValue NewScale =
56861 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56862 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56863 }
56864 }
56865 }
56866
56867 // Shrink indices if they are larger than 32-bits.
56868 // Only do this before legalize types since v2i64 could become v2i32.
56869 // FIXME: We could check that the type is legal if we're after legalize
56870 // types, but then we would need to construct test cases where that happens.
56871 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56872 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56873
56874 // FIXME: We could support more than just constant folding, but we need to
56875 // be careful with costing. A truncate that can be optimized out would be
56876 // fine. Otherwise we might only want to create a truncate if it avoids
56877 // a split.
56878 if (SDValue TruncIndex =
56879 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56880 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56881
56882 // Shrink any sign/zero extends from a 32-bit or smaller type to a larger
56883 // than 32-bit type if there are sufficient sign bits. Only do this before
56884 // legalize types to avoid creating illegal types in truncate.
56885 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56886 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56887 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56888 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56889 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56890 }
56891
56892 // Shrink if we remove an illegal type.
56893 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56894 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56895 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56896 }
56897 }
56898 }
56899
56900 // Try to move splat adders from the index operand to the base
56901 // pointer operand, taking care to multiply by the scale. We can only do
56902 // this when the index element type is the same as the pointer type.
56903 // Otherwise we need to be sure the math doesn't wrap before the scale.
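// For example (illustrative): a gather with Base = B, Scale = 4 and
// Index = add(V, splat(16)) becomes Base = B + 16*4 = B + 64 with Index = V,
// folding the splat into the displacement.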
56904 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56905 isa<ConstantSDNode>(Scale)) {
56906 uint64_t ScaleAmt = Scale->getAsZExtVal();
56907
56908 for (unsigned I = 0; I != 2; ++I)
56909 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56910 BitVector UndefElts;
56911 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56912 if (UndefElts.none()) {
56913 // If the splat value is constant we can add the scaled splat value
56914 // to the existing base.
56915 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56916 APInt Adder = C->getAPIntValue() * ScaleAmt;
56917 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56918 DAG.getConstant(Adder, DL, PtrVT));
56919 SDValue NewIndex = Index.getOperand(1 - I);
56920 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56921 }
56922 // For non-constant cases, limit this to non-scaled cases.
56923 if (ScaleAmt == 1) {
56924 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56925 SDValue NewIndex = Index.getOperand(1 - I);
56926 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56927 }
56928 }
56929 }
56930 // It's also possible the base is just a constant. In that case, just
56931 // replace it with 0 and move the displacement into the index.
56932 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
56933 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
56934 // Combine the constant build_vector and the constant base.
56935 Splat =
56936 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
56937 // Add to the other half of the original Index add.
56938 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
56939 Index.getOperand(1 - I), Splat);
56940 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
56941 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56942 }
56943 }
56944 }
56945
56946 if (DCI.isBeforeLegalizeOps()) {
56947 // Make sure the index is either i32 or i64
56948 if (IndexWidth != 32 && IndexWidth != 64) {
56949 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
56950 IndexVT = IndexVT.changeVectorElementType(EltVT);
56951 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
56952 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56953 }
56954 }
56955
56956 // With vector masks we only demand the upper bit of the mask.
56957 SDValue Mask = GorS->getMask();
56958 if (Mask.getScalarValueSizeInBits() != 1) {
56959 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56960 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56961 if (N->getOpcode() != ISD::DELETED_NODE)
56962 DCI.AddToWorklist(N);
56963 return SDValue(N, 0);
56964 }
56965 }
56966
56967 return SDValue();
56968}
56969
56970// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
56972 const X86Subtarget &Subtarget) {
56973 SDLoc DL(N);
56974 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
56975 SDValue EFLAGS = N->getOperand(1);
56976
56977 // Try to simplify the EFLAGS and condition code operands.
56978 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
56979 return getSETCC(CC, Flags, DL, DAG);
56980
56981 return SDValue();
56982}
56983
56984/// Optimize branch condition evaluation.
56986 const X86Subtarget &Subtarget) {
56987 SDLoc DL(N);
56988 SDValue EFLAGS = N->getOperand(3);
56989 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
56990
56991 // Try to simplify the EFLAGS and condition code operands.
56992 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
56993 // RAUW them under us.
56994 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
56995 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
56996 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
56997 N->getOperand(1), Cond, Flags);
56998 }
56999
57000 return SDValue();
57001}
57002
57003// TODO: Could we move this to DAGCombine?
57005 SelectionDAG &DAG) {
57006 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57007 // to optimize away operation when it's from a constant.
57008 //
57009 // The general transformation is:
57010 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57011 // AND(VECTOR_CMP(x,y), constant2)
57012 // constant2 = UNARYOP(constant)
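// For example (illustrative, v4i32 -> v4f32):
//   sint_to_fp(and(vector_cmp(x,y), splat(1)))
// becomes
//   bitcast(and(vector_cmp(x,y), bitcast(sint_to_fp(splat(1)))))
// Each cmp lane is 0 or all-ones, so the AND selects either the bit pattern
// of 1.0f or 0.0f, exactly matching sint_to_fp of 1 or 0.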
57013
57014 // Early exit if this isn't a vector operation, the operand of the
57015 // unary operation isn't a bitwise AND, or if the sizes of the operations
57016 // aren't the same.
57017 EVT VT = N->getValueType(0);
57018 bool IsStrict = N->isStrictFPOpcode();
57019 unsigned NumEltBits = VT.getScalarSizeInBits();
57020 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57021 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57022 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57023 VT.getSizeInBits() != Op0.getValueSizeInBits())
57024 return SDValue();
57025
57026 // Now check that the other operand of the AND is a constant. We could
57027 // make the transformation for non-constant splats as well, but it's unclear
57028 // that would be a benefit as it would not eliminate any operations, just
57029 // perform one more step in scalar code before moving to the vector unit.
57030 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57031 // Bail out if the vector isn't a constant.
57032 if (!BV->isConstant())
57033 return SDValue();
57034
57035 // Everything checks out. Build up the new and improved node.
57036 SDLoc DL(N);
57037 EVT IntVT = BV->getValueType(0);
57038 // Create a new constant of the appropriate type for the transformed
57039 // DAG.
57040 SDValue SourceConst;
57041 if (IsStrict)
57042 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57043 {N->getOperand(0), SDValue(BV, 0)});
57044 else
57045 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57046 // The AND node needs bitcasts to/from an integer vector type around it.
57047 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57048 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57049 MaskConst);
57050 SDValue Res = DAG.getBitcast(VT, NewAnd);
57051 if (IsStrict)
57052 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57053 return Res;
57054 }
57055
57056 return SDValue();
57057}
57058
57059/// If we are converting a value to floating-point, try to replace scalar
57060/// truncate of an extracted vector element with a bitcast. This tries to keep
57061/// the sequence on XMM registers rather than moving between vector and GPRs.
57063 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57064 // to allow being called by any similar cast opcode.
57065 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57066 SDValue Trunc = N->getOperand(0);
57067 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57068 return SDValue();
57069
57070 SDValue ExtElt = Trunc.getOperand(0);
57071 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57072 !isNullConstant(ExtElt.getOperand(1)))
57073 return SDValue();
57074
57075 EVT TruncVT = Trunc.getValueType();
57076 EVT SrcVT = ExtElt.getValueType();
57077 unsigned DestWidth = TruncVT.getSizeInBits();
57078 unsigned SrcWidth = SrcVT.getSizeInBits();
57079 if (SrcWidth % DestWidth != 0)
57080 return SDValue();
57081
57082 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
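// For example (illustrative, little-endian):
//   sint_to_fp(trunc i32 (extractelt v2i64 X, 0))
// becomes
//   sint_to_fp(extractelt (bitcast X to v4i32), 0)
// keeping the value in an XMM register instead of bouncing through a GPR.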
57083 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57084 unsigned VecWidth = SrcVecVT.getSizeInBits();
57085 unsigned NumElts = VecWidth / DestWidth;
57086 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57087 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57088 SDLoc DL(N);
57089 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57090 BitcastVec, ExtElt.getOperand(1));
57091 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57092}
57093
57095 const X86Subtarget &Subtarget) {
57096 bool IsStrict = N->isStrictFPOpcode();
57097 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57098 EVT VT = N->getValueType(0);
57099 EVT InVT = Op0.getValueType();
57100
57101 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57102 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57103 // if hasFP16 support:
57104 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57105 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57106 // else
57107 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57108 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
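// In each case the ZEXT guarantees the widened value is non-negative, so the
// signed conversion gives the same result the unsigned one would have.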
57109 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57110 unsigned ScalarSize = InVT.getScalarSizeInBits();
57111 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57112 ScalarSize >= 64)
57113 return SDValue();
57114 SDLoc dl(N);
57115 EVT DstVT =
57117 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57118 : ScalarSize < 32 ? MVT::i32
57119 : MVT::i64,
57120 InVT.getVectorNumElements());
57121 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57122 if (IsStrict)
57123 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57124 {N->getOperand(0), P});
57125 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57126 }
57127
57128 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57129 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57130 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57131 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57132 VT.getScalarType() != MVT::f16) {
57133 SDLoc dl(N);
57134 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57135 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57136
57137 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57138 if (IsStrict)
57139 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57140 {N->getOperand(0), P});
57141 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57142 }
57143
57144 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
57145 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57146 // the optimization here.
57147 SDNodeFlags Flags = N->getFlags();
57148 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57149 if (IsStrict)
57150 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57151 {N->getOperand(0), Op0});
57152 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57153 }
57154
57155 return SDValue();
57156}
57157
57160 const X86Subtarget &Subtarget) {
57161 // First try to optimize away the conversion entirely when it's
57162 // conditionally from a constant. Vectors only.
57163 bool IsStrict = N->isStrictFPOpcode();
57165 return Res;
57166
57167 // Now move on to more general possibilities.
57168 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57169 EVT VT = N->getValueType(0);
57170 EVT InVT = Op0.getValueType();
57171
57172 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57173 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
57174 // if hasFP16 support:
57175 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57176 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57177 // else
57178 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57179 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57180 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57181 unsigned ScalarSize = InVT.getScalarSizeInBits();
57182 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57183 ScalarSize >= 64)
57184 return SDValue();
57185 SDLoc dl(N);
57186 EVT DstVT =
57188 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57189 : ScalarSize < 32 ? MVT::i32
57190 : MVT::i64,
57191 InVT.getVectorNumElements());
57192 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57193 if (IsStrict)
57194 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57195 {N->getOperand(0), P});
57196 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57197 }
57198
57199 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57200 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57201 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57202 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57203 VT.getScalarType() != MVT::f16) {
57204 SDLoc dl(N);
57205 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57206 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57207 if (IsStrict)
57208 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57209 {N->getOperand(0), P});
57210 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57211 }
57212
57213 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57214 // vectors and scalars, see if we know that the upper bits are all the sign
57215 // bit, in which case we can truncate the input to i32 and convert from that.
57216 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57217 unsigned BitWidth = InVT.getScalarSizeInBits();
57218 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57219 if (NumSignBits >= (BitWidth - 31)) {
57220 EVT TruncVT = MVT::i32;
57221 if (InVT.isVector())
57222 TruncVT = InVT.changeVectorElementType(TruncVT);
57223 SDLoc dl(N);
57224 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57225 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57226 if (IsStrict)
57227 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57228 {N->getOperand(0), Trunc});
57229 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57230 }
57231 // If we're after legalize and the type is v2i32 we need to shuffle and
57232 // use CVTSI2P.
57233 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57234 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57235 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57236 { 0, 2, -1, -1 });
57237 if (IsStrict)
57238 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57239 {N->getOperand(0), Shuf});
57240 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57241 }
57242 }
57243
57244 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57245 // a 32-bit target where SSE doesn't support i64->FP operations.
57246 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57247 Op0.getOpcode() == ISD::LOAD) {
57248 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57249
57250 // This transformation is not supported if the result type is f16 or f128.
57251 if (VT == MVT::f16 || VT == MVT::f128)
57252 return SDValue();
57253
57254 // If we have AVX512DQ we can use packed conversion instructions unless
57255 // the VT is f80.
57256 if (Subtarget.hasDQI() && VT != MVT::f80)
57257 return SDValue();
57258
57259 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57260 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57261 std::pair<SDValue, SDValue> Tmp =
57262 Subtarget.getTargetLowering()->BuildFILD(
57263 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57264 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57265 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57266 return Tmp.first;
57267 }
57268 }
57269
57270 if (IsStrict)
57271 return SDValue();
57272
57273 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57274 return V;
57275
57276 return SDValue();
57277}
57278
57280 const X86Subtarget &Subtarget) {
57281 EVT VT = N->getValueType(0);
57282 SDValue Src = N->getOperand(0);
57283 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57284 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57285 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57286
57287 return SDValue();
57288}
57289
57290// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57292 const X86Subtarget &Subtarget) {
57293 if (!Subtarget.hasAVX10_2())
57294 return SDValue();
57295
57296 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57297 EVT SrcVT = N->getOperand(0).getValueType();
57298 EVT DstVT = N->getValueType(0);
57299 SDLoc dl(N);
57300
57301 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57302 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57303
57304 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57305 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57306 N->getOperand(0), V2F32Value);
57307
57308 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57309 if (IsSigned)
57310 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57311
57312 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57313 }
57314 return SDValue();
57315}
57316
57318 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57319
57320 for (const SDNode *User : Flags->users()) {
57321 X86::CondCode CC;
57322 switch (User->getOpcode()) {
57323 default:
57324 // Be conservative.
57325 return true;
57326 case X86ISD::SETCC:
57328 CC = (X86::CondCode)User->getConstantOperandVal(0);
57329 break;
57330 case X86ISD::BRCOND:
57331 case X86ISD::CMOV:
57332 CC = (X86::CondCode)User->getConstantOperandVal(2);
57333 break;
57334 }
57335
57336 switch (CC) {
57337 // clang-format off
57338 default: break;
57339 case X86::COND_A: case X86::COND_AE:
57340 case X86::COND_B: case X86::COND_BE:
57341 case X86::COND_O: case X86::COND_NO:
57342 case X86::COND_G: case X86::COND_GE:
57343 case X86::COND_L: case X86::COND_LE:
57344 return true;
57345 // clang-format on
57346 }
57347 }
57348
57349 return false;
57350}
57351
57352static bool onlyZeroFlagUsed(SDValue Flags) {
57353 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57354
57355 for (const SDNode *User : Flags->users()) {
57356 unsigned CCOpNo;
57357 switch (User->getOpcode()) {
57358 default:
57359 // Be conservative.
57360 return false;
57361 case X86ISD::SETCC:
57363 CCOpNo = 0;
57364 break;
57365 case X86ISD::BRCOND:
57366 case X86ISD::CMOV:
57367 CCOpNo = 2;
57368 break;
57369 }
57370
57371 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57372 if (CC != X86::COND_E && CC != X86::COND_NE)
57373 return false;
57374 }
57375
57376 return true;
57377}
57378
57381 const X86Subtarget &Subtarget) {
57382 // Only handle test patterns.
57383 if (!isNullConstant(N->getOperand(1)))
57384 return SDValue();
57385
57386 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57387 // and use its flags directly.
57388 // TODO: Maybe we should try promoting compares that only use the zero flag
57389 // first if we can prove the upper bits with computeKnownBits?
57390 SDLoc dl(N);
57391 SDValue Op = N->getOperand(0);
57392 EVT VT = Op.getValueType();
57393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57394
57395 if (SDValue CMP =
57396 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57397 return CMP;
57398
57399 // If we have a constant logical shift that's only used in a comparison
57400 // against zero turn it into an equivalent AND. This allows turning it into
57401 // a TEST instruction later.
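// For example (illustrative, i32): (cmp (srl x, 8), 0) only needs to know
// whether any of bits [31:8] of x are set, so it becomes
//   (cmp (and x, 0xFFFFFF00), 0)
// which isel can later match as a TEST.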
57402 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57403 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57404 onlyZeroFlagUsed(SDValue(N, 0))) {
57405 unsigned BitWidth = VT.getSizeInBits();
57406 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57407 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57408 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57409 APInt Mask = Op.getOpcode() == ISD::SRL
57410 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57411 : APInt::getLowBitsSet(BitWidth, MaskBits);
57412 if (Mask.isSignedIntN(32)) {
57413 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57414 DAG.getConstant(Mask, dl, VT));
57415 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57416 DAG.getConstant(0, dl, VT));
57417 }
57418 }
57419 }
57420
57421 // If we're extracting from an AVX512 bool vector and comparing against zero,
57422 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57423 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
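// For example (illustrative): testing bit 5 of a v16i1 mask K via
//   (and (extract_elt (kshiftr K, 5), 0), 1)
// becomes
//   (and (bitcast K to i16), 1 << 5)
// so the mask register can be tested directly.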
57424 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57425 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57426 SDValue Src = Op.getOperand(0);
57427 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57428 isNullConstant(Src.getOperand(1)) &&
57429 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57430 SDValue BoolVec = Src.getOperand(0);
57431 unsigned ShAmt = 0;
57432 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57433 ShAmt = BoolVec.getConstantOperandVal(1);
57434 BoolVec = BoolVec.getOperand(0);
57435 }
57436 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57437 EVT VecVT = BoolVec.getValueType();
57438 unsigned BitWidth = VecVT.getVectorNumElements();
57439 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57440 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57441 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57442 Op = DAG.getBitcast(BCVT, BoolVec);
57443 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57444 DAG.getConstant(Mask, dl, BCVT));
57445 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57446 DAG.getConstant(0, dl, BCVT));
57447 }
57448 }
57449 }
57450
57451 // Peek through any zero-extend if we're only testing for a zero result.
57452 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57453 SDValue Src = Op.getOperand(0);
57454 EVT SrcVT = Src.getValueType();
57455 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57456 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57457 DAG.getConstant(0, dl, SrcVT));
57458 }
57459
57460 // Look for a truncate.
57461 if (Op.getOpcode() != ISD::TRUNCATE)
57462 return SDValue();
57463
57464 SDValue Trunc = Op;
57465 Op = Op.getOperand(0);
57466
57467 // See if we can compare with zero against the truncation source,
57468 // which should help using the Z flag from many ops. Only do this for an
57469 // i32 truncated op to prevent partial-reg compares of promoted ops.
57470 EVT OpVT = Op.getValueType();
57471 APInt UpperBits =
57473 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57474 onlyZeroFlagUsed(SDValue(N, 0))) {
57475 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57476 DAG.getConstant(0, dl, OpVT));
57477 }
57478
57479 // After this the truncate and arithmetic op must have a single use.
57480 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57481 return SDValue();
57482
57483 unsigned NewOpc;
57484 switch (Op.getOpcode()) {
57485 default: return SDValue();
57486 case ISD::AND:
57487 // Skip AND with a constant. We have special handling for AND with an
57488 // immediate during isel to generate test instructions.
57489 if (isa<ConstantSDNode>(Op.getOperand(1)))
57490 return SDValue();
57491 NewOpc = X86ISD::AND;
57492 break;
57493 case ISD::OR: NewOpc = X86ISD::OR; break;
57494 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57495 case ISD::ADD:
57496 // If the carry or overflow flag is used, we can't truncate.
57498 return SDValue();
57499 NewOpc = X86ISD::ADD;
57500 break;
57501 case ISD::SUB:
57502 // If the carry or overflow flag is used, we can't truncate.
57504 return SDValue();
57505 NewOpc = X86ISD::SUB;
57506 break;
57507 }
57508
57509 // We found an op we can narrow. Truncate its inputs.
57510 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57511 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57512
57513 // Use an X86-specific opcode to avoid DAG combine messing with it.
57514 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57515 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57516
57517 // For AND, keep a CMP so that we can match the test pattern.
57518 if (NewOpc == X86ISD::AND)
57519 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57520 DAG.getConstant(0, dl, VT));
57521
57522 // Return the flags.
57523 return Op.getValue(1);
57524}
57525
57528 const X86Subtarget &ST) {
57529 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57530 "Expected X86ISD::ADD or X86ISD::SUB");
57531
57532 SDLoc DL(N);
57533 SDValue LHS = N->getOperand(0);
57534 SDValue RHS = N->getOperand(1);
57535 MVT VT = LHS.getSimpleValueType();
57536 bool IsSub = X86ISD::SUB == N->getOpcode();
57537 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57538
57539 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57540 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57541 return CMP;
57542
57543 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57544 if (!N->hasAnyUseOfValue(1)) {
57545 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57546 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57547 }
57548
57549 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57550 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57551 SDValue Ops[] = {N0, N1};
57552 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57553 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57554 SDValue Op(N, 0);
57555 if (Negate) {
57556 // Bail if this is only used by a user of the x86 add/sub.
57557 if (GenericAddSub->hasOneUse() &&
57558 GenericAddSub->user_begin()->isOnlyUserOf(N))
57559 return;
57560 Op = DAG.getNegative(Op, DL, VT);
57561 }
57562 DCI.CombineTo(GenericAddSub, Op);
57563 }
57564 };
57565 MatchGeneric(LHS, RHS, false);
57566 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57567
57568 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57569 // EFLAGS result doesn't change.
57570 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57571 /*ZeroSecondOpOnly*/ true);
57572}
57573
57575 SDValue LHS = N->getOperand(0);
57576 SDValue RHS = N->getOperand(1);
57577 SDValue BorrowIn = N->getOperand(2);
57578
57579 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57580 MVT VT = N->getSimpleValueType(0);
57581 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57582 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57583 }
57584
57585 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57586 // iff the flag result is dead.
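// (Both forms compute X - Y - Borrow; only the EFLAGS they produce differ,
// which is why the flag result must be unused.)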
57587 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57588 !N->hasAnyUseOfValue(1))
57589 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57590 LHS.getOperand(1), BorrowIn);
57591
57592 return SDValue();
57593}
57594
57595// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57598 SDValue LHS = N->getOperand(0);
57599 SDValue RHS = N->getOperand(1);
57600 SDValue CarryIn = N->getOperand(2);
57601 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57602 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57603
57604 // Canonicalize constant to RHS.
57605 if (LHSC && !RHSC)
57606 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57607 CarryIn);
57608
57609 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57610 // the result is either zero or one (depending on the input carry bit).
57611 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57612 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57613 // We don't have a good way to replace an EFLAGS use, so only do this when
57614 // dead right now.
57615 SDValue(N, 1).use_empty()) {
57616 SDLoc DL(N);
57617 EVT VT = N->getValueType(0);
57618 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57619 SDValue Res1 = DAG.getNode(
57620 ISD::AND, DL, VT,
57622 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57623 DAG.getConstant(1, DL, VT));
57624 return DCI.CombineTo(N, Res1, CarryOut);
57625 }
57626
57627 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57628 // iff the flag result is dead.
57629 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
57630 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57631 SDLoc DL(N);
57632 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57633 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57634 DAG.getConstant(0, DL, LHS.getValueType()),
57635 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57636 }
57637
57638 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57639 MVT VT = N->getSimpleValueType(0);
57640 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57641 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57642 }
57643
57644 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57645 // iff the flag result is dead.
57646 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57647 !N->hasAnyUseOfValue(1))
57648 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57649 LHS.getOperand(1), CarryIn);
57650
57651 return SDValue();
57652}
57653
57655 const SDLoc &DL, EVT VT,
57656 const X86Subtarget &Subtarget) {
57657 using namespace SDPatternMatch;
57658
57659 // Example of pattern we try to detect:
57660 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57661 //(add (build_vector (extract_elt t, 0),
57662 // (extract_elt t, 2),
57663 // (extract_elt t, 4),
57664 // (extract_elt t, 6)),
57665 // (build_vector (extract_elt t, 1),
57666 // (extract_elt t, 3),
57667 // (extract_elt t, 5),
57668 // (extract_elt t, 7)))
57669
57670 if (!Subtarget.hasSSE2())
57671 return SDValue();
57672
57673 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57674 VT.getVectorNumElements() < 4 ||
57676 return SDValue();
57677
57678 SDValue Op0, Op1, Accum;
57683 m_Value(Op1))))))
57684 return SDValue();
57685
57686 // Check if one of Op0,Op1 is of the form:
57687 // (build_vector (extract_elt Mul, 0),
57688 // (extract_elt Mul, 2),
57689 // (extract_elt Mul, 4),
57690 // ...
57691 // the other is of the form:
57692 // (build_vector (extract_elt Mul, 1),
57693 // (extract_elt Mul, 3),
57694 // (extract_elt Mul, 5),
57695 // ...
57696 // and identify Mul.
57697 SDValue Mul;
57698 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57699 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57700 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57701 // TODO: Be more tolerant to undefs.
57702 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57703 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57704 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57705 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57706 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57707 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57708 return SDValue();
57709 // Commutativity of mul allows factors of a product to reorder.
57710 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57711 std::swap(Idx0L, Idx1L);
57712 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57713 std::swap(Idx0H, Idx1H);
57714 // Commutativity of add allows pairs of factors to reorder.
57715 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57716 std::swap(Idx0L, Idx0H);
57717 std::swap(Idx1L, Idx1H);
57718 }
57719 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57720 Idx1H != 2 * i + 3)
57721 return SDValue();
57722 if (!Mul) {
57723 // The first time an extract_elt's source vector is visited, it must be a MUL
57724 // with 2X the number of vector elements of the BUILD_VECTOR.
57725 // Both extracts must be from the same MUL.
57726 Mul = Vec0L;
57727 if (Mul.getOpcode() != ISD::MUL ||
57728 Mul.getValueType().getVectorNumElements() != 2 * e)
57729 return SDValue();
57730 }
57731 // Check that the extract is from the same MUL previously seen.
57732 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57733 return SDValue();
57734 }
57735
57736 // Check if the Mul source can be safely shrunk.
57738 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57740 return SDValue();
57741
57742 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57743 VT.getVectorNumElements() * 2);
57744 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57745 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57746
57747 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57749 EVT InVT = Ops[0].getValueType();
57750 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57751 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57752 InVT.getVectorNumElements() / 2);
57753 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57754 };
57755 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57756 if (Accum)
57757 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57758 return R;
57759}
57760
57761// Attempt to turn this pattern into PMADDWD.
57762// (add (mul (sext (build_vector)), (sext (build_vector))),
57763// (mul (sext (build_vector)), (sext (build_vector)))
57765 const SDLoc &DL, EVT VT,
57766 const X86Subtarget &Subtarget) {
57767 using namespace SDPatternMatch;
57768
57769 if (!Subtarget.hasSSE2())
57770 return SDValue();
57771
57772 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57773 VT.getVectorNumElements() < 4 ||
57775 return SDValue();
57776
57777 // All inputs need to be sign extends.
57778 // TODO: Support ZERO_EXTEND from known positive?
57779 SDValue N00, N01, N10, N11;
57780 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57781 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57782 return SDValue();
57783
57784 // Must be extending from vXi16.
57785 EVT InVT = N00.getValueType();
57786 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57787 N10.getValueType() != InVT || N11.getValueType() != InVT)
57788 return SDValue();
57789
57790 // All inputs should be build_vectors.
57791 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57792 N01.getOpcode() != ISD::BUILD_VECTOR ||
57793 N10.getOpcode() != ISD::BUILD_VECTOR ||
57795 return SDValue();
57796
57797 // For each element, we need to ensure we have an odd element from one vector
57798 // multiplied by the odd element of another vector and the even element from
57799 // one of the same vectors being multiplied by the even element from the
57800 // other vector. So we need to make sure for each element i, this operator
57801 // is being performed:
57802 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57803 SDValue In0, In1;
57804 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57805 SDValue N00Elt = N00.getOperand(i);
57806 SDValue N01Elt = N01.getOperand(i);
57807 SDValue N10Elt = N10.getOperand(i);
57808 SDValue N11Elt = N11.getOperand(i);
57809 // TODO: Be more tolerant to undefs.
57810 SDValue N00In, N01In, N10In, N11In;
57811 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57812 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57813 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57814 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57815 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57816 return SDValue();
57817 // Add is commutative so indices can be reordered.
57818 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57819 std::swap(IdxN00, IdxN10);
57820 std::swap(IdxN01, IdxN11);
57821 }
57822 // N0 indices must be the even element. N1 indices must be the next odd element.
57823 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57824 IdxN11 != 2 * i + 1)
57825 return SDValue();
57826
57827 // The first time we find an input, capture it.
57828 if (!In0) {
57829 In0 = N00In;
57830 In1 = N01In;
57831
57832 // The input vectors must be at least as wide as the output.
57833 // If they are larger than the output, we extract a subvector below.
57834 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57835 In1.getValueSizeInBits() < VT.getSizeInBits())
57836 return SDValue();
57837 }
57838 // Mul is commutative so the input vectors can be in any order.
57839 // Canonicalize to make the compares easier.
57840 if (In0 != N00In)
57841 std::swap(N00In, N01In);
57842 if (In0 != N10In)
57843 std::swap(N10In, N11In);
57844 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57845 return SDValue();
57846 }
57847
57848 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57850 EVT OpVT = Ops[0].getValueType();
57851 assert(OpVT.getScalarType() == MVT::i16 &&
57852 "Unexpected scalar element type");
57853 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57854 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57855 OpVT.getVectorNumElements() / 2);
57856 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57857 };
57858
57859 // If the output is narrower than an input, extract the low part of the input
57860 // vector.
57861 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57862 VT.getVectorNumElements() * 2);
57863 if (OutVT16.bitsLT(In0.getValueType())) {
57864 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57865 DAG.getVectorIdxConstant(0, DL));
57866 }
57867 if (OutVT16.bitsLT(In1.getValueType())) {
57868 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57869 DAG.getVectorIdxConstant(0, DL));
57870 }
57871 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57872 PMADDBuilder);
57873}
57874
57875// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57876 // If the upper element in each pair of both VPMADDWDs is zero then we can
57877 // merge the operand elements and use the implicit add of VPMADDWD.
57878// TODO: Add support for VPMADDUBSW (which isn't commutable).
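// For example (illustrative, v4i32 result): if every odd i16 lane of the
// inputs is known zero, each VPMADDWD lane is really a single 16x16->32
// multiply, so interleaving the even lanes of (X,Z) and of (Y,W) lets one
// VPMADDWD compute both products and sum them via its implicit pairwise add.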
57880 const SDLoc &DL, EVT VT) {
57881 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57882 return SDValue();
57883
57884 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57885 if (VT.getSizeInBits() > 128)
57886 return SDValue();
57887
57888 unsigned NumElts = VT.getVectorNumElements();
57889 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57891 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57892
57893 bool Op0HiZero =
57894 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57895 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57896 bool Op1HiZero =
57897 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57898 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57899
57900 // TODO: Check for zero lower elements once we have actual codegen that
57901 // creates them.
57902 if (!Op0HiZero || !Op1HiZero)
57903 return SDValue();
57904
57905 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57906 SmallVector<int> Mask;
57907 for (int i = 0; i != (int)NumElts; ++i) {
57908 Mask.push_back(2 * i);
57909 Mask.push_back(2 * (i + NumElts));
57910 }
57911
57912 SDValue LHS =
57913 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57914 SDValue RHS =
57915 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57916 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57917}
57918
57919/// CMOV of constants requires materializing constant operands in registers.
57920/// Try to fold those constants into an 'add' instruction to reduce instruction
57921 /// count. We do this with CMOV rather than the generic 'select' because there are
57922/// earlier folds that may be used to turn select-of-constants into logic hacks.
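/// For example (illustrative, i64): (add (cmov 3, 4), x) is rewritten below as
/// (cmov (add x, 3), (add x, 4)); both adds can become 2-operand LEAs, so the
/// two constant materializations are folded away.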
57924 SelectionDAG &DAG,
57925 const X86Subtarget &Subtarget) {
57926 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57927 // better because we eliminate 1-2 instructions. This transform is still
57928 // an improvement without zero operands because we trade 2 move constants and
57929 // 1 add for 2 adds (LEA) as long as the constants can be represented as
57930 // immediate asm operands (fit in 32-bits).
57931 auto isSuitableCmov = [](SDValue V) {
57932 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
57933 return false;
57934 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
57935 !isa<ConstantSDNode>(V.getOperand(1)))
57936 return false;
57937 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
57938 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
57939 V.getConstantOperandAPInt(1).isSignedIntN(32));
57940 };
57941
57942 // Match an appropriate CMOV as the first operand of the add.
57943 SDValue Cmov = N->getOperand(0);
57944 SDValue OtherOp = N->getOperand(1);
57945 if (!isSuitableCmov(Cmov))
57946 std::swap(Cmov, OtherOp);
57947 if (!isSuitableCmov(Cmov))
57948 return SDValue();
57949
57950 // Don't remove a load folding opportunity for the add. That would neutralize
57951 // any improvements from removing constant materializations.
57952 if (X86::mayFoldLoad(OtherOp, Subtarget))
57953 return SDValue();
57954
57955 EVT VT = N->getValueType(0);
57956 SDValue FalseOp = Cmov.getOperand(0);
57957 SDValue TrueOp = Cmov.getOperand(1);
57958
57959 // We will push the add through the select, but we can potentially do better
57960 // if we know there is another add in the sequence and this is pointer math.
57961 // In that case, we can absorb an add into the trailing memory op and avoid
57962 // a 3-operand LEA which is likely slower than a 2-operand LEA.
57963 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
57964 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
57965 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
57966 all_of(N->users(), [&](SDNode *Use) {
57967 auto *MemNode = dyn_cast<MemSDNode>(Use);
57968 return MemNode && MemNode->getBasePtr().getNode() == N;
57969 })) {
57970 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
57971 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
57972 // it is possible that choosing op1 might be better.
57973 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
57974 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
57975 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
57976 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
57977 Cmov.getOperand(2), Cmov.getOperand(3));
57978 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
57979 }
57980
57981 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
57982 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
57983 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
57984 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
57985 Cmov.getOperand(3));
57986}
57987
57988// Attempt to turn ADD(MUL(x, y), acc)) -> VPMADD52L
57989// When upper 12 bits of x, y and MUL(x, y) are known to be 0
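// (VPMADD52L multiplies the low 52 bits of each 64-bit lane and adds the low
// 52 bits of the product to the accumulator. When x, y and x*y all fit in 52
// bits, that is exactly MUL(x, y) + acc, so the pair can be fused.)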
57991 EVT VT, const X86Subtarget &Subtarget) {
57992 using namespace SDPatternMatch;
57993 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
57994 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
57995 return SDValue();
57996
57997 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
57998 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
57999 VT.getSizeInBits() < 512)
58000 return SDValue();
58001
58002 const auto TotalSize = VT.getSizeInBits();
58003 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58004 return SDValue();
58005
58006 SDValue X, Y, Acc;
58007 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58008 return SDValue();
58009
58010 KnownBits KnownX = DAG.computeKnownBits(X);
58011 if (KnownX.countMinLeadingZeros() < 12)
58012 return SDValue();
58013 KnownBits KnownY = DAG.computeKnownBits(Y);
58014 if (KnownY.countMinLeadingZeros() < 12)
58015 return SDValue();
58016 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58017 if (KnownMul.countMinLeadingZeros() < 12)
58018 return SDValue();
58019
58020 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58021 ArrayRef<SDValue> SubOps) {
58022 EVT SubVT = SubOps[0].getValueType();
58023 assert(SubVT.getScalarSizeInBits() == 64 &&
58024 "Unexpected element size, only supports 64bit size");
58025 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58026 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58027 };
58028
58029 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58030 /*CheckBWI*/ false);
58031}
58032
58035 const X86Subtarget &Subtarget) {
58036 using namespace SDPatternMatch;
58037 EVT VT = N->getValueType(0);
58038 SDValue Op0 = N->getOperand(0);
58039 SDValue Op1 = N->getOperand(1);
58040 SDLoc DL(N);
58041
58042 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58043 return Select;
58044
58045 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58046 return MAdd;
58047 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58048 return MAdd;
58049 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58050 return MAdd;
58051
58052 // Try to synthesize horizontal adds from adds of shuffles.
58053 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58054 return V;
58055
58056 // Canonicalize hidden LEA pattern:
58057 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58058 // iff c < 4
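// For example (illustrative, i64): ((x << 3) - y) + z is reassociated to
// ((x << 3) + z) - y so the shl+add half can be matched as a single LEA with
// scale 8, leaving only the subtract as a separate instruction.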
58059 if (VT == MVT::i32 || VT == MVT::i64) {
58060 SDValue Y, Z, Shift;
58061 APInt Amt;
58062 if (sd_match(
58064 m_Shl(m_Value(), m_ConstInt(Amt))),
58065 m_Value(Y))),
58066 m_Value(Z))) &&
58067 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58068 return DAG.getNode(ISD::SUB, DL, VT,
58069 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58070 }
58071 }
58072
58073 SDValue X, Y;
58074
58075 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58076 // iff X and Y won't overflow.
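// (PSADBW against zero horizontally sums the unsigned i8 lanes; if no
// per-lane add can wrap, summing first and doing a single PSADBW is
// equivalent.)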
58077 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58079 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58080 MVT OpVT = X.getSimpleValueType();
58081 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58082 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58083 getZeroVector(OpVT, Subtarget, DAG, DL));
58084 }
58085
58086 if (VT.isVector()) {
58087 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58089
58090 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58091 // (sub Y, (sext (vXi1 X))).
58092 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58093 // in generic DAG combine without a legal type check, but adding this there
58094 // caused regressions.
58095 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58097 m_Value(Y)))) {
58098 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58099 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58100 }
58101
58102 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58103 // canonicalisation as we don't have good vXi8 shifts.
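// (For vXi8, (srl Y, 7) is 0 or 1 per lane depending on the sign of Y, while
// (icmp_sgt 0, Y) is 0 or -1 for the same lanes, so adding the former is the
// same as subtracting the latter.)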
58104 if (VT.getScalarType() == MVT::i8 &&
58106 SDValue Cmp =
58107 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58108 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58109 }
58110 }
58111
58112 // Peephole for 512-bit VPDPBSSD on non-VLX targets.
58113 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58114 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58115 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58116 if (sd_match(N, m_Add(m_Value(Accum),
58119 m_Value(Lo1)),
58121 m_Value(Hi1)))))) {
58122 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58123 concatSubVectors(Lo0, Hi0, DAG, DL),
58124 concatSubVectors(Lo1, Hi1, DAG, DL));
58125 }
58126 }
58127
58128 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58129 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58130 X86::isZeroNode(Op0.getOperand(1))) {
58131 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58132 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58133 Op0.getOperand(0), Op0.getOperand(2));
58134 }
58135
58136 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58137 return IFMA52;
58138
58139 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58140}
58141
58142// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58143// condition comes from the subtract node that produced -X. This matches the
58144// cmov expansion for absolute value. By swapping the operands we convert abs
58145// to nabs.
58146static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58147 SelectionDAG &DAG) {
58148 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58149 return SDValue();
58150
58151 SDValue Cond = N1.getOperand(3);
58152 if (Cond.getOpcode() != X86ISD::SUB)
58153 return SDValue();
58154 assert(Cond.getResNo() == 1 && "Unexpected result number");
58155
58156 SDValue FalseOp = N1.getOperand(0);
58157 SDValue TrueOp = N1.getOperand(1);
58158 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58159
58160 // ABS condition should come from a negate operation.
58161 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58162 isNullConstant(Cond.getOperand(0))) {
58163 // Get the X and -X from the negate.
58164 SDValue NegX = Cond.getValue(0);
58165 SDValue X = Cond.getOperand(1);
58166
58167 // Cmov operands should be X and NegX. Order doesn't matter.
58168 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58169 return SDValue();
58170
58171 // Build a new CMOV with the operands swapped.
58172 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58173 N1.getOperand(2), Cond);
58174 // Convert sub to add.
58175 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58176 }
58177
58178 // Handle ABD special case:
58179 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58180 // ABD condition should come from a pair of matching subtracts.
58181 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58182 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58183 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58184 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58185 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58186 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58187 // Build a new CMOV with the operands swapped.
58188 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58189 Cond);
58190 }
58191
58192 return SDValue();
58193}
58194
58195 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58196 SDValue Op0 = N->getOperand(0);
58197 SDValue Op1 = N->getOperand(1);
58198
58199 // (sub C (zero_extend (setcc)))
58200 // =>
58201 // (add (zero_extend (setcc inverted)) (C-1)) if C is a nonzero immediate
58202 // Don't disturb (sub 0 setcc), which is easily done with neg.
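// For b in {0,1}: C - b == (C - 1) + (1 - b), where (1 - b) is the inverted
// setcc, so the immediate moves to the RHS of an add.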
58203 EVT VT = N->getValueType(0);
58204 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58205 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58206 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58207 Op1.getOperand(0).hasOneUse()) {
58208 SDValue SetCC = Op1.getOperand(0);
58209 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58210 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58211 APInt NewImm = Op0C->getAPIntValue() - 1;
58212 SDLoc DL(Op1);
58213 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58214 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58215 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58216 DAG.getConstant(NewImm, DL, VT));
58217 }
58218
58219 return SDValue();
58220}
58221
58222 static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58223 if (N->getConstantOperandVal(3) != X86::COND_NE)
58224 return SDValue();
58225
58226 SDValue Sub = N->getOperand(4);
58227 if (Sub.getOpcode() != X86ISD::SUB)
58228 return SDValue();
58229
58230 SDValue Op1 = Sub.getOperand(1);
58231
58232 if (!X86::isZeroNode(Sub.getOperand(0)))
58233 return SDValue();
58234
58235 SDLoc DL(N);
58236 SmallVector<SDValue, 5> Ops(N->op_values());
58237 if (Op1.getOpcode() == X86ISD::SETCC) {
58238 // res, flags2 = sub 0, (setcc cc, flag)
58239 // cload/cstore ..., cond_ne, flag2
58240 // ->
58241 // cload/cstore cc, flag
58242 Ops[3] = Op1.getOperand(0);
58243 Ops[4] = Op1.getOperand(1);
58244 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58245 SDValue Src = Op1;
58246 SDValue Op10 = Op1.getOperand(0);
58247 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
58248 // res, flags2 = sub 0, (and (xor X, -1), Y)
58249 // cload/cstore ..., cond_ne, flag2
58250 // ->
58251 // res, flags2 = sub 0, (and X, Y)
58252 // cload/cstore ..., cond_e, flag2
58253 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58254 Op1.getOperand(1));
58255 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58256 }
58257 // res, flags2 = sub 0, (and X, Y)
58258 // cload/cstore ..., cc, flag2
58259 // ->
58260 // res, flags2 = cmp (and X, Y), 0
58261 // cload/cstore ..., cc, flag2
58262 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58263 } else {
58264 return SDValue();
58265 }
58266
58267 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58268 cast<MemSDNode>(N)->getMemoryVT(),
58269 cast<MemSDNode>(N)->getMemOperand());
58270}
58271
58272 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58273 TargetLowering::DAGCombinerInfo &DCI,
58274 const X86Subtarget &Subtarget) {
58275 EVT VT = N->getValueType(0);
58276 SDValue Op0 = N->getOperand(0);
58277 SDValue Op1 = N->getOperand(1);
58278 SDLoc DL(N);
58279
58280 auto IsNonOpaqueConstant = [&](SDValue Op) {
58281 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58282 /*AllowOpaques*/ false);
58283 };
58284
58285 // X86 can't encode an immediate LHS of a sub. See if we can push the
58286 // negation into a preceding instruction. If the RHS of the sub is an XOR with
58287 // one use and a constant, invert the immediate, saving one register.
58288 // However, ignore cases where C1 is 0, as those will become a NEG.
58289 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
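// Since -v == ~v + 1:
//   C1 - (X ^ C2) == ~(X ^ C2) + C1 + 1 == (X ^ ~C2) + (C1 + 1).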
58290 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58291 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58292 Op1->hasOneUse()) {
58293 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58294 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58295 SDValue NewAdd =
58296 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58297 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58298 }
58299
58300 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58301 return V;
58302
58303 // Try to synthesize horizontal subs from subs of shuffles.
58304 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58305 return V;
58306
58307 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58308 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58309 X86::isZeroNode(Op1.getOperand(1))) {
58310 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58311 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58312 Op1.getOperand(0), Op1.getOperand(2));
58313 }
58314
58315 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58316 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58317 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58318 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58319 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58320 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58321 Op1.getOperand(1), Op1.getOperand(2));
58322 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58323 }
58324
58325 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58326 return V;
58327
58328 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58329 return V;
58330
58331 return combineSubSetcc(N, DAG);
58332}
58333
58334 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58335 const X86Subtarget &Subtarget) {
58336 unsigned Opcode = N->getOpcode();
58337 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58338 "Unknown PCMP opcode");
58339
58340 SDValue LHS = N->getOperand(0);
58341 SDValue RHS = N->getOperand(1);
58342 MVT VT = N->getSimpleValueType(0);
58343 unsigned EltBits = VT.getScalarSizeInBits();
58344 unsigned NumElts = VT.getVectorNumElements();
58345 SDLoc DL(N);
58346
58347 if (LHS == RHS)
58348 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58349 : DAG.getConstant(0, DL, VT);
58350
58351 // Constant Folding.
58352 // PCMPEQ(X,UNDEF) -> UNDEF
58353 // PCMPGT(X,UNDEF) -> 0
58354 // PCMPGT(UNDEF,X) -> 0
58355 APInt LHSUndefs, RHSUndefs;
58356 SmallVector<APInt> LHSBits, RHSBits;
58357 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58358 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58359 APInt Ones = APInt::getAllOnes(EltBits);
58360 APInt Zero = APInt::getZero(EltBits);
58361 SmallVector<APInt> Results(NumElts);
58362 for (unsigned I = 0; I != NumElts; ++I) {
58363 if (Opcode == X86ISD::PCMPEQ) {
58364 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58365 } else {
58366 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58367 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58368 }
58369 }
58370 if (Opcode == X86ISD::PCMPEQ)
58371 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58372 return getConstVector(Results, VT, DAG, DL);
58373 }
58374
58375 return SDValue();
58376}
58377
58378// Helper to determine if we can convert an integer comparison to a float
58379 // comparison by casting the operands.
58380static std::optional<unsigned>
58381CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58382 unsigned NumSignificantBitsRHS) {
58383 MVT SVT = VT.getScalarType();
58384 assert(SVT == MVT::f32 && "Only tested for float so far");
58385 const fltSemantics &Sem = SVT.getFltSemantics();
58386 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58387 "Only PCMPEQ/PCMPGT currently supported");
58388
58389 // TODO: Handle bitcastable integers.
58390
58391 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58392 // a fp value.
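// e.g. an f32 significand holds 24 bits, so integers with at most 24
// significant bits convert exactly via sint_to_fp and the signed fp compare
// matches the integer compare.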
58393 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58394 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58395 return ISD::SINT_TO_FP;
58396
58397 return std::nullopt;
58398}
58399
58400/// Helper that combines an array of subvector ops as if they were the operands
58401 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58402/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
58403 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58404 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58405 const X86Subtarget &Subtarget,
58406 unsigned Depth) {
58407 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58408 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58409
58410 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58411 return DAG.getUNDEF(VT);
58412
58413 if (llvm::all_of(Ops, [](SDValue Op) {
58414 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58415 }))
58416 return getZeroVector(VT, Subtarget, DAG, DL);
58417
58418 if (Depth >= SelectionDAG::MaxRecursionDepth)
58419 return SDValue(); // Limit search depth.
58420
58421 SDValue Op0 = Ops[0];
58422 bool IsSplat = llvm::all_equal(Ops);
58423 unsigned NumOps = Ops.size();
58424 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58425 LLVMContext &Ctx = *DAG.getContext();
58426
58427 // Repeated subvectors.
58428 if (IsSplat &&
58429 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58430 // If this broadcast is inserted into both halves, use a larger broadcast.
58431 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58432 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58433
58434 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58435 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58436 (Subtarget.hasAVX2() ||
58438 VT.getScalarType(), Subtarget)))
58439 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58440 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58441 Op0.getOperand(0),
58442 DAG.getVectorIdxConstant(0, DL)));
58443
58444 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58445 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58446 (Subtarget.hasAVX2() ||
58447 (EltSizeInBits >= 32 &&
58448 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58449 Op0.getOperand(0).getValueType() == VT.getScalarType())
58450 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58451
58452 // concat_vectors(extract_subvector(splat(x)),
58453 // extract_subvector(splat(x))) -> splat(x)
58454 // concat_vectors(extract_subvector(subv_broadcast(x)),
58455 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58456 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58457 Op0.getOperand(0).getValueType() == VT) {
58458 SDValue SrcVec = Op0.getOperand(0);
58459 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58460 return SrcVec;
58461 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58462 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58463 return SrcVec;
58464 }
58465
58466 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58467 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58468 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58469 return DAG.getNode(Op0.getOpcode(), DL, VT,
58471 Op0.getOperand(0), Op0.getOperand(0)),
58472 Op0.getOperand(1));
58473 }
58474
58475 // TODO: This should go in combineX86ShufflesRecursively eventually.
58476 if (NumOps == 2) {
58477 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58478 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58479 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58481 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58482 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58483 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58484 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58485 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58486 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58487 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58488 // Only concat subvector high halves (which vperm2x128 is best at), or
58489 // cases where it should fold into a subvector broadcast.
58490 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58491 SrcVT1.is256BitVector()) {
58492 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58493 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58494 "Bad subvector index");
58495 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58496 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58497 unsigned Index = 0;
58498 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58499 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
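// imm[1:0] selects the source 128-bit lane for the low result lane and
// imm[5:4] for the high lane, with selectors 2/3 referring to the second
// source; e.g. taking both high halves gives 0x31.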
58500 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58501 DAG.getBitcast(VT, Src0.getOperand(0)),
58502 DAG.getBitcast(VT, Src1.getOperand(0)),
58503 DAG.getTargetConstant(Index, DL, MVT::i8));
58504 }
58505 }
58506 // Widen extract_subvector
58507 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58508 // --> extract_subvector(x,lo)
58509 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58510 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58511 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58512 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58513 return DAG.getBitcast(VT,
58515 Src0.getConstantOperandVal(1),
58516 DAG, DL, VT.getSizeInBits()));
58517 }
58518 }
58519 }
58520
58521 // Repeated opcode.
58522 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58523 // but it currently struggles with different vector widths.
58524 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58525 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58526 })) {
58527 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58529 for (SDValue SubOp : SubOps)
58530 Subs.push_back(SubOp.getOperand(I));
58531 // Attempt to peek through bitcasts and concat the original subvectors.
58532 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58533 if (SubVT.isSimple() && SubVT.isVector()) {
58534 MVT ConcatVT =
58536 SubVT.getVectorElementCount() * Subs.size());
58537 for (SDValue &Sub : Subs)
58538 Sub = DAG.getBitcast(SubVT, Sub);
58539 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58540 Subtarget, Depth + 1))
58541 return DAG.getBitcast(VT, ConcatSrc);
58542 return DAG.getBitcast(
58543 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58544 }
58545 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58546 };
58547 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58548 bool AllConstants = true;
58549 bool AllSubs = true;
58550 unsigned VecSize = VT.getSizeInBits();
58551 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58552 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58553 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58554 }))
58555 return true;
58556 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58557 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58558 unsigned SubSize = BC.getValueSizeInBits();
58559 unsigned EltSize = BC.getScalarValueSizeInBits();
58560 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58562 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58563 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58564 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58565 }
58566 return AllConstants || AllSubs;
58567 };
58568 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58569 bool AllConstants = true;
58571 for (SDValue SubOp : SubOps) {
58572 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58573 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58575 Subs.push_back(SubOp.getOperand(I));
58576 }
58577 if (AllConstants)
58578 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58579 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58580 };
58581
58582 unsigned Opcode = Op0.getOpcode();
58583 switch (Opcode) {
58584 case ISD::BITCAST: {
58585 // TODO: Support AVX1/AVX2 bitcasts.
58587 for (SDValue SubOp : Ops)
58588 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58589 EVT InnerVT = SubOps[0].getValueType();
58590 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58591 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58592 (Subtarget.hasBWI() ||
58593 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58594 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58595 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58596 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58597 return Op.getValueType() == InnerVT;
58598 })) {
58599 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58600 MVT ConcatVT = MVT::getVectorVT(
58601 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58602 if (SDValue ConcatSrc = combineConcatVectorOps(
58603 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58604 return DAG.getBitcast(VT, ConcatSrc);
58605 }
58606 break;
58607 }
58608 case ISD::VECTOR_SHUFFLE: {
58609 // TODO: Generalize NumOps support.
58610 if (!IsSplat && NumOps == 2 &&
58611 ((VT.is256BitVector() &&
58612 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58613 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58614 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58615 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58616 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58617 if (Concat0 || Concat1 ||
58618 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58619 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58620 Subtarget.hasVBMI())) {
58621 int NumSubElts = Op0.getValueType().getVectorNumElements();
58622 SmallVector<int> NewMask;
58623 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58624 M = M >= NumSubElts ? M + NumSubElts : M;
58625 NewMask.push_back(M);
58626 }
58627 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58628 if (0 <= M)
58629 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58630 NewMask.push_back(M);
58631 }
58632 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58633 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58634 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58635 }
58636 }
58637 break;
58638 }
58639 case X86ISD::VBROADCAST: {
58640 // TODO: 512-bit VBROADCAST concatenation.
58641 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58642 return Op.getOperand(0).getValueType().is128BitVector();
58643 })) {
58644 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58645 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58646 ConcatSubOperand(VT, Ops, 0),
58647 ConcatSubOperand(VT, Ops, 0));
58648 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58649 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58650 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58652 DL, VT, ConcatSubOperand(VT, Ops, 0),
58653 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58654 }
58655 break;
58656 }
58657 case X86ISD::MOVDDUP:
58658 case X86ISD::MOVSHDUP:
58659 case X86ISD::MOVSLDUP: {
58660 if (!IsSplat && (VT.is256BitVector() ||
58661 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58662 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58663 break;
58664 }
58665 case X86ISD::SHUFP: {
58666 if (!IsSplat &&
58667 (VT == MVT::v8f32 ||
58668 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58669 llvm::all_of(Ops, [Op0](SDValue Op) {
58670 return Op.getOperand(2) == Op0.getOperand(2);
58671 })) {
58672 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58673 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58674 if (Concat0 || Concat1)
58675 return DAG.getNode(Opcode, DL, VT,
58676 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58677 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58678 Op0.getOperand(2));
58679 }
58680 break;
58681 }
58682 case X86ISD::UNPCKH:
58683 case X86ISD::UNPCKL: {
58684 // TODO: UNPCK should use CombineSubOperand
58685 // Don't concatenate build_vector patterns.
58686 if (!IsSplat &&
58687 ((VT.is256BitVector() &&
58688 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58689 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58690 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58691 none_of(Ops, [](SDValue Op) {
58692 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58694 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58696 })) {
58697 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58698 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58699 if (Concat0 || Concat1 ||
58700 (Subtarget.hasInt256() && EltSizeInBits == 64))
58701 return DAG.getNode(Opcode, DL, VT,
58702 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58703 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58704 }
58705 break;
58706 }
58707 case X86ISD::PSHUFHW:
58708 case X86ISD::PSHUFLW:
58709 case X86ISD::PSHUFD:
58710 if (!IsSplat &&
58711 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58712 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58713 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58714 llvm::all_of(Ops, [Op0](SDValue Op) {
58715 return Op.getOperand(1) == Op0.getOperand(1);
58716 })) {
58717 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58718 Op0.getOperand(1));
58719 }
58720 [[fallthrough]];
58721 case X86ISD::VPERMILPI:
58722 if (!IsSplat && EltSizeInBits == 32 &&
58723 (VT.is256BitVector() ||
58724 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58725 all_of(Ops, [&Op0](SDValue Op) {
58726 return Op0.getOperand(1) == Op.getOperand(1);
58727 })) {
58728 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58729 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58730 Res =
58731 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58732 return DAG.getBitcast(VT, Res);
58733 }
58734 break;
58735 case X86ISD::VPERMILPV:
58736 if (!IsSplat && (VT.is256BitVector() ||
58737 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58738 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58739 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58740 if (Concat0 || Concat1)
58741 return DAG.getNode(Opcode, DL, VT,
58742 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58743 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58744 }
58745 break;
58746 case X86ISD::PSHUFB:
58747 case X86ISD::PSADBW:
58748 case X86ISD::VPMADDUBSW:
58749 case X86ISD::VPMADDWD:
58750 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58751 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58752 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58753 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58754 NumOps * SrcVT.getVectorNumElements());
58755 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58756 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58757 if (Concat0 || Concat1)
58758 return DAG.getNode(
58759 Opcode, DL, VT,
58760 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58761 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58762 }
58763 break;
58764 case X86ISD::VPERMV:
58765 // TODO: Handle 256-bit and NumOps == 4 cases.
58766 if (!IsSplat && NumOps == 2 &&
58767 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58768 MVT OpVT = Op0.getSimpleValueType();
58769 int NumSrcElts = OpVT.getVectorNumElements();
58770 SmallVector<int, 64> ConcatMask;
58771 for (unsigned i = 0; i != NumOps; ++i) {
58772 SmallVector<int, 64> SubMask;
58774 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58775 break;
58776 for (int M : SubMask) {
58777 if (0 <= M)
58778 M += i * NumSrcElts;
58779 ConcatMask.push_back(M);
58780 }
58781 }
58782 if (ConcatMask.size() == (NumOps * NumSrcElts))
58783 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58784 ConcatSubOperand(VT, Ops, 1),
58785 DAG.getUNDEF(VT), Subtarget, DAG);
58786 }
58787 break;
58788 case X86ISD::VPERMV3:
58789 // TODO: Handle 256-bit and NumOps == 4 cases.
58790 if (!IsSplat && NumOps == 2 &&
58791 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58792 MVT OpVT = Op0.getSimpleValueType();
58793 int NumSrcElts = OpVT.getVectorNumElements();
58794 SmallVector<int, 64> ConcatMask;
58795 for (unsigned i = 0; i != NumOps; ++i) {
58796 SmallVector<int, 64> SubMask;
58798 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58799 break;
58800 for (int M : SubMask) {
58801 if (0 <= M) {
58802 int Src = M < NumSrcElts ? 0 : 2;
58803 M += M < NumSrcElts ? 0 : NumSrcElts;
58804
58805 // Reference the lowest sub if the upper sub is the same.
58806 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58807 M += i * NumSrcElts;
58808 }
58809 ConcatMask.push_back(M);
58810 }
58811 }
58812 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58813 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58814 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58815 if (Concat0 || Concat1)
58816 return lowerShuffleWithPERMV(
58817 DL, VT, ConcatMask,
58818 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58819 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58820 DAG);
58821 }
58822 }
58823 break;
58824 case X86ISD::VPERM2X128: {
58825 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58826 assert(NumOps == 2 && "Bad concat_vectors operands");
58827 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58828 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58829 // TODO: Handle zero'd subvectors.
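// Bits 3 and 7 of the VPERM2X128 immediate request a zeroed 128-bit half,
// so (Imm & 0x88) == 0 means neither lane is zeroed here.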
58830 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58831 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58832 (int)((Imm1 >> 4) & 0x3)};
58833 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58834 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58835 Ops[0].getOperand(1), DAG, DL);
58836 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58837 Ops[1].getOperand(1), DAG, DL);
58838 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58839 DAG.getBitcast(ShuffleVT, LHS),
58840 DAG.getBitcast(ShuffleVT, RHS),
58841 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58842 return DAG.getBitcast(VT, Res);
58843 }
58844 }
58845 break;
58846 }
58847 case X86ISD::SHUF128: {
58848 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58849 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58850 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58851 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58852 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58853 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58854 Ops[0].getOperand(1), DAG, DL);
58855 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58856 Ops[1].getOperand(1), DAG, DL);
58857 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58858 DAG.getTargetConstant(Imm, DL, MVT::i8));
58859 }
58860 break;
58861 }
58862 case ISD::TRUNCATE:
58863 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58864 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58865 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58866 SrcVT == Ops[1].getOperand(0).getValueType() &&
58867 Subtarget.useAVX512Regs() &&
58868 Subtarget.getPreferVectorWidth() >= 512 &&
58869 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58870 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58871 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58872 ConcatSubOperand(NewSrcVT, Ops, 0));
58873 }
58874 }
58875 break;
58876 case ISD::ANY_EXTEND:
58877 case ISD::SIGN_EXTEND:
58878 case ISD::ZERO_EXTEND:
58879 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58880 if (!IsSplat && NumOps == 2 &&
58881 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58882 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58883 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58884 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58885 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58886 SrcVT == Ops[1].getOperand(0).getValueType()) {
58887 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58888 return DAG.getNode(Opcode, DL, VT,
58889 ConcatSubOperand(NewSrcVT, Ops, 0));
58890 }
58891 }
58892 break;
58893 case ISD::ANY_EXTEND_VECTOR_INREG:
58894 case ISD::SIGN_EXTEND_VECTOR_INREG:
58895 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58896 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58897 if (!IsSplat && NumOps == 2 &&
58898 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58899 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58900 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58902 Op0.getOperand(0).getValueType() ==
58903 Ops[0].getOperand(0).getValueType()) {
58904 EVT SrcVT = Op0.getOperand(0).getValueType();
58905 unsigned NumElts = VT.getVectorNumElements();
58906 MVT UnpackSVT =
58907 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58908 MVT UnpackVT =
58909 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58910 SDValue Unpack =
58911 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58912 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58913 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58914 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58915 DAG.getBitcast(SrcVT, Unpack), DAG);
58916 }
58917 break;
58918 }
58919 case X86ISD::VSHLI:
58920 case X86ISD::VSRLI:
58921 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
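// e.g. for SHL by 32, each v4i64 lane's low dword moves into its high dword
// and the low dword becomes zero, which as v8i32 is the shuffle
// {8,0,8,2,8,4,8,6} against a zero vector (indices >= 8 pick zeros).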
58922 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
58923 llvm::all_of(Ops, [](SDValue Op) {
58924 return Op.getConstantOperandAPInt(1) == 32;
58925 })) {
58926 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
58927 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
58928 Res = DAG.getBitcast(MVT::v8i32, Res);
58929 if (Opcode == X86ISD::VSHLI) {
58930 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58931 {8, 0, 8, 2, 8, 4, 8, 6});
58932 } else {
58933 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
58934 {1, 8, 3, 8, 5, 8, 7, 8});
58935 }
58936 return DAG.getBitcast(VT, Res);
58937 }
58938 }
58939 [[fallthrough]];
58940 case X86ISD::VSRAI:
58941 case X86ISD::VSHL:
58942 case X86ISD::VSRL:
58943 case X86ISD::VSRA:
58944 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
58945 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58946 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58947 llvm::all_of(Ops, [Op0](SDValue Op) {
58948 return Op0.getOperand(1) == Op.getOperand(1);
58949 })) {
58950 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58951 Op0.getOperand(1));
58952 }
58953 break;
58954 case X86ISD::VPERMI:
58955 case X86ISD::VROTLI:
58956 case X86ISD::VROTRI:
58957 if (!IsSplat &&
58958 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58959 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58960 llvm::all_of(Ops, [Op0](SDValue Op) {
58961 return Op0.getOperand(1) == Op.getOperand(1);
58962 })) {
58963 assert(!(Opcode == X86ISD::VPERMI &&
58964 Op0.getValueType().is128BitVector()) &&
58965 "Illegal 128-bit X86ISD::VPERMI nodes");
58966 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58967 Op0.getOperand(1));
58968 }
58969 break;
58970 case ISD::AND:
58971 case ISD::OR:
58972 case ISD::XOR:
58973 case X86ISD::ANDNP:
58974 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
58975 if (!IsSplat && (VT.is256BitVector() ||
58976 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58977 // Don't concatenate root AVX1 NOT patterns.
58978 // TODO: Allow NOT folding if Concat0 succeeds.
58979 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
58980 llvm::all_of(Ops, [](SDValue X) {
58981 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
58982 }))
58983 break;
58984 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58985 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58986 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
58987 return DAG.getNode(Opcode, DL, VT,
58988 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58989 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58990 }
58991 break;
58992 case X86ISD::PCMPEQ:
58993 case X86ISD::PCMPGT:
58994 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
58995 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
58996 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58997 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58998 if (Concat0 || Concat1)
58999 return DAG.getNode(Opcode, DL, VT,
59000 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59001 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59002 break;
59003 }
59004
59005 if (!IsSplat && VT == MVT::v8i32) {
59006 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59007 // TODO: Handle v4f64 as well?
59008 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59009 for (unsigned I = 0; I != NumOps; ++I) {
59010 MaxSigBitsLHS =
59011 std::max(MaxSigBitsLHS,
59012 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59013 MaxSigBitsRHS =
59014 std::max(MaxSigBitsRHS,
59015 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59016 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59017 break;
59018 }
59019
59020 ISD::CondCode ICC =
59021 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59022 ISD::CondCode FCC =
59023 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59024
59025 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59026 MVT FpVT = VT.changeVectorElementType(FpSVT);
59027
59028 if (std::optional<unsigned> CastOpc =
59029 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59030 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59031 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59032 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59033 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59034 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59035 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59036
59037 bool IsAlwaysSignaling;
59038 unsigned FSETCC =
59039 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59040 return DAG.getBitcast(
59041 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59042 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59043 }
59044 }
59045 break;
59046 case ISD::CTPOP:
59047 case ISD::CTTZ:
59048 case ISD::CTLZ:
59049 case ISD::CTTZ_ZERO_UNDEF:
59050 case ISD::CTLZ_ZERO_UNDEF:
59051 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59052 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59053 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59054 }
59055 break;
59056 case X86ISD::GF2P8AFFINEQB:
59057 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59058 if (!IsSplat &&
59059 (VT.is256BitVector() ||
59060 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59061 llvm::all_of(Ops, [Op0](SDValue Op) {
59062 return Op0.getOperand(2) == Op.getOperand(2);
59063 })) {
59064 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59065 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59066 }
59067 break;
59068 case ISD::ADD:
59069 case ISD::SUB:
59070 case ISD::MUL:
59071 // TODO: Add more integer binops?
59072 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59073 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59074 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59075 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59076 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59077 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59078 return Op.getOperand(0) == Op.getOperand(1);
59079 }))
59080 return DAG.getNode(Opcode, DL, VT,
59081 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59082 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59083 }
59084 break;
59085 // VADD, VSUB and VMUL can execute on more ports than VINSERT and have short
59086 // latencies, so we only concatenate them when doing so does not introduce
59087 // extra VINSERTs.
59088 case ISD::FADD:
59089 case ISD::FSUB:
59090 case ISD::FMUL:
59091 if (!IsSplat && (VT.is256BitVector() ||
59092 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59093 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59094 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59095 if (Concat0 || Concat1)
59096 return DAG.getNode(Opcode, DL, VT,
59097 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59098 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59099 }
59100 break;
59101 // Always prefer to concatenate high latency FDIV instructions.
59102 case ISD::FDIV:
59103 if (!IsSplat && (VT.is256BitVector() ||
59104 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59105 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59106 ConcatSubOperand(VT, Ops, 1));
59107 }
59108 break;
59109 case X86ISD::HADD:
59110 case X86ISD::HSUB:
59111 case X86ISD::FHADD:
59112 case X86ISD::FHSUB:
59113 if (!IsSplat && VT.is256BitVector() &&
59114 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59115 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59116 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59117 if (Concat0 || Concat1)
59118 return DAG.getNode(Opcode, DL, VT,
59119 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59120 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59121 }
59122 break;
59123 case X86ISD::PACKSS:
59124 case X86ISD::PACKUS:
59125 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59126 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59127 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59128 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59129 NumOps * SrcVT.getVectorNumElements());
59130 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59131 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59132 if (Concat0 || Concat1)
59133 return DAG.getNode(
59134 Opcode, DL, VT,
59135 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59136 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59137 }
59138 break;
59139 case X86ISD::VSHLD:
59140 case X86ISD::VSHRD:
59141 case X86ISD::PALIGNR:
59142 if (!IsSplat &&
59143 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59144 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59145 llvm::all_of(Ops, [Op0](SDValue Op) {
59146 return Op0.getOperand(2) == Op.getOperand(2);
59147 })) {
59148 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59149 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59150 if (Concat0 || Concat1)
59151 return DAG.getNode(Opcode, DL, VT,
59152 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59153 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59154 Op0.getOperand(2));
59155 }
59156 break;
59157 case X86ISD::BLENDI:
59158 if (VT.is256BitVector() && NumOps == 2 &&
59159 (EltSizeInBits >= 32 ||
59160 (Subtarget.hasInt256() &&
59161 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
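// The two subvector blend masks occupy disjoint halves of the wider mask:
// Ops[0]'s bits stay in the low half, Ops[1]'s bits are inserted at
// NumElts/2, and the result is truncated to the 8-bit immediate.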
59162 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59163 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59164 if (Concat0 || Concat1) {
59165 unsigned NumElts = VT.getVectorNumElements();
59166 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59167 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59168 Mask = Mask.zextOrTrunc(8);
59169 return DAG.getNode(Opcode, DL, VT,
59170 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59171 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59172 DAG.getTargetConstant(Mask, DL, MVT::i8));
59173 }
59174 }
59175 // TODO: BWI targets should only use CombineSubOperand.
59176 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59177 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59178 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59179 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59180 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59181 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59182 unsigned NumElts = VT.getVectorNumElements();
59183 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59184 for (unsigned I = 1; I != NumOps; ++I)
59185 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59186 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59187 Mask = Mask.zextOrTrunc(NumMaskBits);
59188 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59189 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59190 SDValue Sel =
59191 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59192 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59193 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59194 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59195 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59196 }
59197 }
59198 break;
59199 case ISD::VSELECT:
59200 // TODO: VSELECT should use CombineSubOperand.
59201 if (!IsSplat && Subtarget.hasAVX512() &&
59202 (VT.is256BitVector() ||
59203 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59204 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59205 EVT SelVT = Ops[0].getOperand(0).getValueType();
59206 if (SelVT.getVectorElementType() == MVT::i1) {
59207 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59208 NumOps * SelVT.getVectorNumElements());
59209 if (TLI.isTypeLegal(SelVT))
59210 return DAG.getNode(
59211 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59212 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59213 }
59214 }
59215 [[fallthrough]];
59216 case X86ISD::BLENDV:
59217 // TODO: BLENDV should use CombineSubOperand.
59218 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59219 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59220 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59221 EVT SelVT = Ops[0].getOperand(0).getValueType();
59222 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59223 if (TLI.isTypeLegal(SelVT))
59224 return DAG.getNode(
59225 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59226 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59227 }
59228 break;
59229 }
59230 }
59231
59232 // Fold subvector loads into one.
59233 // If needed, look through bitcasts to get to the load.
59234 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59235 unsigned Fast;
59236 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59237 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59238 *FirstLd->getMemOperand(), &Fast) &&
59239 Fast) {
59240 if (SDValue Ld =
59241 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59242 return Ld;
59243 }
59244 }
59245
59246 // Attempt to fold target constant loads.
59247 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59248 SmallVector<APInt> EltBits;
59249 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59250 for (unsigned I = 0; I != NumOps; ++I) {
59251 APInt OpUndefElts;
59252 SmallVector<APInt> OpEltBits;
59253 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59254 OpEltBits, /*AllowWholeUndefs*/ true,
59255 /*AllowPartialUndefs*/ false))
59256 break;
59257 EltBits.append(OpEltBits);
59258 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59259 }
59260 if (EltBits.size() == VT.getVectorNumElements()) {
59261 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59262 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59263 SDValue CV = DAG.getConstantPool(C, PVT);
59266 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59267 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59269 return Ld;
59270 }
59271 }
59272
59273 // If this simple subvector or scalar/subvector broadcast_load is inserted
59274 // into both halves, use a larger broadcast_load. Update other uses to use
59275 // an extracted subvector.
59276 if (IsSplat &&
59277 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59278 if (ISD::isNormalLoad(Op0.getNode()) ||
59281 auto *Mem = cast<MemSDNode>(Op0);
59282 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59283 ? X86ISD::VBROADCAST_LOAD
59284 : X86ISD::SUBV_BROADCAST_LOAD;
59285 if (SDValue BcastLd =
59286 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59287 SDValue BcastSrc =
59288 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59289 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59290 return BcastLd;
59291 }
59292 }
59293 }
59294
59295 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
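// A zero immediate selects 128-bit lane 0 for every result lane, so SHUF128
// of the widened subvector with itself acts as an in-register broadcast.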
59296 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59297 Subtarget.useAVX512Regs()) {
59298 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59299 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59300 Res = DAG.getBitcast(ShuffleVT, Res);
59301 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59302 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59303 return DAG.getBitcast(VT, Res);
59304 }
59305
59306 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59307 if (!IsSplat &&
59308 ((NumOps == 2 && VT == MVT::v4f64) ||
59309 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59310 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59311 // Collect the individual per-lane v2f64/v4f64 shuffles.
59312 MVT OpVT = Ops[0].getSimpleValueType();
59313 unsigned NumOpElts = OpVT.getVectorNumElements();
59316 if (all_of(seq<int>(NumOps), [&](int I) {
59317 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59318 Depth + 1) &&
59319 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59320 none_of(SrcMasks[I], isUndefOrZero) &&
59321 SrcMasks[I].size() == NumOpElts &&
59322 all_of(SrcOps[I], [&OpVT](SDValue V) {
59323 return V.getValueType() == OpVT;
59324 });
59325 })) {
59326 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59327 bool Unary = true;
59328 unsigned SHUFPDMask = 0;
59330 for (unsigned I = 0; I != NumOps; ++I) {
59331 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59332 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59333 Unary &= LHS[I] == RHS[I];
59334 for (unsigned J = 0; J != NumOpElts; ++J)
59335 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59336 }
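// SHUFPDMask now has one bit per f64 element: bit (I * NumOpElts + J) is set
// when element J of subvector I takes the odd element of its source 128-bit
// lane.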
59337 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59338 // PERMILPD mask and we can always profitably concatenate them.
59339 SDValue Concat0 =
59340 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59341 SDValue Concat1 =
59342 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59343 if (Unary || Concat0 || Concat1) {
59344 Concat0 =
59345 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59346 Concat1 =
59347 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59348 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59349 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59350 }
59351 }
59352 }
59353
59354 return SDValue();
59355}
59356
59357 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59358 TargetLowering::DAGCombinerInfo &DCI,
59359 const X86Subtarget &Subtarget) {
59360 EVT VT = N->getValueType(0);
59361 EVT SrcVT = N->getOperand(0).getValueType();
59362 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59364
59365 if (VT.getVectorElementType() == MVT::i1) {
59366 // Attempt to constant fold.
59367 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59369 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59371 if (!C) break;
59372 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59373 if (I == (E - 1)) {
59374 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59375 if (TLI.isTypeLegal(IntVT))
59376 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59377 }
59378 }
59379
59380 // Don't do anything else for i1 vectors.
59381 return SDValue();
59382 }
59383
59384 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59385 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59386 Subtarget))
59387 return R;
59388 }
59389
59390 return SDValue();
59391}
59392
59393 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59394 TargetLowering::DAGCombinerInfo &DCI,
59395 const X86Subtarget &Subtarget) {
59396 if (DCI.isBeforeLegalizeOps())
59397 return SDValue();
59398
59399 MVT OpVT = N->getSimpleValueType(0);
59400
59401 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59402
59403 SDLoc dl(N);
59404 SDValue Vec = N->getOperand(0);
59405 SDValue SubVec = N->getOperand(1);
59406
59407 uint64_t IdxVal = N->getConstantOperandVal(2);
59408 MVT SubVecVT = SubVec.getSimpleValueType();
59409 int VecNumElts = OpVT.getVectorNumElements();
59410 int SubVecNumElts = SubVecVT.getVectorNumElements();
59411
59412 if (Vec.isUndef() && SubVec.isUndef())
59413 return DAG.getUNDEF(OpVT);
59414
59415 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59416 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59417 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59418 return getZeroVector(OpVT, Subtarget, DAG, dl);
59419
59421 // If we're inserting into a zero vector and then into a larger zero vector,
59422 // just insert into the larger zero vector directly.
59423 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59425 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59426 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59427 getZeroVector(OpVT, Subtarget, DAG, dl),
59428 SubVec.getOperand(1),
59429 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59430 }
59431
59432 // If we're inserting into a zero vector and our input was extracted from an
59433 // insert into a zero vector of the same type and the extraction was at
59434 // least as large as the original insertion, just insert the original
59435 // subvector into a zero vector.
59436 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59437 isNullConstant(SubVec.getOperand(1)) &&
59439 SDValue Ins = SubVec.getOperand(0);
59440 if (isNullConstant(Ins.getOperand(2)) &&
59441 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59442 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59443 SubVecVT.getFixedSizeInBits())
59444 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59445 getZeroVector(OpVT, Subtarget, DAG, dl),
59446 Ins.getOperand(1), N->getOperand(2));
59447 }
59448 }
59449
59450 // Stop here if this is an i1 vector.
59451 if (IsI1Vector)
59452 return SDValue();
59453
59454 // Eliminate an intermediate vector widening:
59455 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59456 // insert_subvector X, Y, Idx
59457 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59458 // there?
59459 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59460 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59461 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59462 SubVec.getOperand(1), N->getOperand(2));
59463
59464 // If this is an insert of an extract, combine to a shuffle. Don't do this
59465 // if the insert or extract can be represented with a subregister operation.
59466 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59467 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59468 (IdxVal != 0 ||
59469 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59470 SDValue ExtSrc = SubVec.getOperand(0);
59471 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59472 // Create a shuffle mask matching the extraction and insertion.
59473 SmallVector<int, 64> Mask(VecNumElts);
59474 std::iota(Mask.begin(), Mask.end(), 0);
59475 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59476 ExtIdxVal + VecNumElts);
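// Mask is the identity permutation of Vec with the inserted window
// redirected into ExtSrc; the VecNumElts offset selects the second shuffle
// operand.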
59477 if (ExtIdxVal != 0)
59478 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59479 // See if we can use a blend instead of extract/insert pair.
59480 SmallVector<int, 64> BlendMask(VecNumElts);
59481 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59482 std::iota(BlendMask.begin() + IdxVal,
59483 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59484 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59485 VecNumElts == (2 * SubVecNumElts)) {
59486 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59487 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59488 SDValue Blend = DAG.getNode(
59489 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59490 DAG.getBitcast(MVT::v8f32, ExtSrc),
59491 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59492 return DAG.getBitcast(OpVT, Blend);
59493 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59494 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59495 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59496 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59497 SDValue Shuffle =
59498 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59499 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59500 return DAG.getBitcast(OpVT, Shuffle);
59501 }
59502 }
59503 }
59504
59505 // Match concat_vector style patterns.
59506 SmallVector<SDValue, 2> SubVectorOps;
59507 if (collectConcatOps(N, SubVectorOps, DAG)) {
59508 if (SDValue Fold =
59509 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59510 return Fold;
59511
59512 // If we're inserting all zeros into the upper half, change this to
59513 // a concat with zero. We will match this to a move
59514 // with implicit upper bit zeroing during isel.
59515 // We do this here because we don't want combineConcatVectorOps to
59516 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59517 if (SubVectorOps.size() == 2 &&
59518 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59519 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59520 getZeroVector(OpVT, Subtarget, DAG, dl),
59521 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59522
59523 // Attempt to recursively combine to a shuffle.
59524 if (all_of(SubVectorOps, [](SDValue SubOp) {
59526 })) {
59527 SDValue Op(N, 0);
59528 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59529 return Res;
59530 }
59531 }
59532
59533 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59534 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59535 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
59536
59537 // If this is a broadcast load inserted into an upper undef, use a larger
59538 // broadcast load.
59539 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59540 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59541 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59542 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59543 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59544 }
59545
59546 // If we're splatting the lower half subvector of a full vector load into the
59547 // upper half, attempt to create a subvector broadcast.
59548 if ((int)IdxVal == (VecNumElts / 2) &&
59549 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59550 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59551 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59552 if (VecLd && SubLd &&
59553 DAG.areNonVolatileConsecutiveLoads(
59554 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59555 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl,
59556 SubVecVT, SubLd, 0, DAG);
59557 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59558 BcastLd, DAG.getVectorIdxConstant(0, dl));
59559 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59560 return BcastLd;
59561 }
59562 }
59563
59564 // Attempt to constant fold (if we're not widening).
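// If both the base vector and the subvector are constant, merge the subvector
// bits (and undef mask) into the base at IdxVal and rebuild one constant.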
59565 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59566 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59567 APInt VecUndefElts, SubUndefElts;
59568 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59569 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59570 VecEltBits) &&
59571 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59572 SubEltBits)) {
59573 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59574 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59575 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59576 }
59577 }
59578
59579 // Attempt to recursively combine to a shuffle.
59582 SDValue Op(N, 0);
59583 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59584 return Res;
59585 }
59586
59587 // Match insertion of subvector load that perfectly aliases a base load.
59588 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59589 ISD::isNormalLoad(SubVec.getNode()) &&
59590 DAG.areNonVolatileConsecutiveLoads(
59591 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59592 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59593 return Vec;
59594
59595 return SDValue();
59596}
59597
59598/// If we are extracting a subvector of a vector select and the select condition
59599/// is composed of concatenated vectors, try to narrow the select width. This
59600/// is a common pattern for AVX1 integer code because 256-bit selects may be
59601 /// legal, but there is almost no integer math/logic available for 256-bit vectors.
59602/// This function should only be called with legal types (otherwise, the calls
59603/// to get simple value types will assert).
59604 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59605 SelectionDAG &DAG) {
59606 SDValue Sel = Ext->getOperand(0);
59607 if (Sel.getOpcode() != ISD::VSELECT ||
59608 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59609 return SDValue();
59610
59611 // Note: We assume simple value types because this should only be called with
59612 // legal operations/types.
59613 // TODO: This can be extended to handle extraction to 256-bits.
59614 MVT VT = Ext->getSimpleValueType(0);
59615 if (!VT.is128BitVector())
59616 return SDValue();
59617
59618 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59619 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59620 return SDValue();
59621
59622 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59623 MVT SelVT = Sel.getSimpleValueType();
59624 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59625 "Unexpected vector type with legal operations");
59626
59627 unsigned SelElts = SelVT.getVectorNumElements();
59628 unsigned CastedElts = WideVT.getVectorNumElements();
59629 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59630 if (SelElts % CastedElts == 0) {
59631 // The select has the same or more (narrower) elements than the extract
59632 // operand. The extraction index gets scaled by that factor.
59633 ExtIdx *= (SelElts / CastedElts);
59634 } else if (CastedElts % SelElts == 0) {
59635 // The select has fewer (wider) elements than the extract operand. Make sure
59636 // that the extraction index can be divided evenly.
59637 unsigned IndexDivisor = CastedElts / SelElts;
59638 if (ExtIdx % IndexDivisor != 0)
59639 return SDValue();
59640 ExtIdx /= IndexDivisor;
59641 } else {
59642 llvm_unreachable("Element count of simple vector types are not divisible?");
59643 }
59644
59645 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59646 unsigned NarrowElts = SelElts / NarrowingFactor;
59647 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59648 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59649 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59650 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59651 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59652 return DAG.getBitcast(VT, NarrowSel);
59653}
59654
59655 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59656 TargetLowering::DAGCombinerInfo &DCI,
59657 const X86Subtarget &Subtarget) {
59658 if (!N->getValueType(0).isSimple())
59659 return SDValue();
59660
59661 MVT VT = N->getSimpleValueType(0);
59662 SDValue InVec = N->getOperand(0);
59663 unsigned IdxVal = N->getConstantOperandVal(1);
59664 EVT InVecVT = InVec.getValueType();
59665 unsigned SizeInBits = VT.getSizeInBits();
59666 unsigned InSizeInBits = InVecVT.getSizeInBits();
59667 unsigned NumSubElts = VT.getVectorNumElements();
59668 unsigned NumInElts = InVecVT.getVectorNumElements();
59669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59670 SDLoc DL(N);
59671
59672 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59673 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59674 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59675 // We let generic combining take over from there to simplify the
59676 // insert/extract and 'not'.
59677 // This pattern emerges during AVX1 legalization. We handle it before lowering
59678 // to avoid complications like splitting constant vector loads.
59679 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59680 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59681 auto isConcatenatedNot = [](SDValue V) {
59682 V = peekThroughBitcasts(V);
59683 if (!isBitwiseNot(V))
59684 return false;
59685 SDValue NotOp = V->getOperand(0);
59686 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59687 };
59688 if (isConcatenatedNot(InVec.getOperand(0)) ||
59689 isConcatenatedNot(InVec.getOperand(1))) {
59690 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59691 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59692 splitVectorIntBinary(InVec, DAG, DL),
59693 N->getOperand(1));
59694 }
59695 }
59696
59697 if (DCI.isBeforeLegalizeOps())
59698 return SDValue();
59699
59700 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59701 return V;
59702
59703 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59704 return getZeroVector(VT, Subtarget, DAG, DL);
59705
59706 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59707 if (VT.getScalarType() == MVT::i1)
59708 return DAG.getConstant(1, DL, VT);
59709 return getOnesVector(VT, DAG, DL);
59710 }
59711
59712 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59713 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59714
59715 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1),C2) --> EXTRACT_SUBVECTOR(V,C1+C2)
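// e.g. with v16i32 V: extract_subvector(extract_subvector(V, 8), 4)
// becomes extract_subvector(V, 12).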
59716 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59717 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59718 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59719 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59720 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59721 }
59722
59723 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59724 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59725 // iff SUB is entirely contained in the extraction.
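// e.g. with v8i32 SRC and v2i32 SUB inserted at index 4, extracting v4i32 at
// index 2 covers elements 2..5, so this becomes
// insert_subvector(extract_subvector(SRC, 2), SUB, 2).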
59726 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59727 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59728 SDValue Src = InVec.getOperand(0);
59729 SDValue Sub = InVec.getOperand(1);
59730 EVT SubVT = Sub.getValueType();
59731 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59732 if (IdxVal <= InsIdx &&
59733 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59734 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59735 DAG.getVectorIdxConstant(IdxVal, DL));
59736 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59737 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59738 }
59739 }
59740
59741 // If we're extracting an upper subvector, see if we'd get the same elements
59742 // by extracting the lowest subvector instead, which should allow
59743 // SimplifyDemandedVectorElts to do more simplifications.
59744 if (IdxVal != 0) {
59745 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59746 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59747 });
59748 if (AllEquiv)
59749 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59750 }
59751
59752 // Check if we're extracting a whole broadcasted subvector.
59753 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59754 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59755 EVT MemVT = MemIntr->getMemoryVT();
59756 if (MemVT == VT) {
59757 // If this is the only use, we can replace it with a regular load (this may
59758 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59759 // memory chain).
59760 if (InVec.hasOneUse()) {
59761 SDValue Ld =
59762 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59763 MemIntr->getMemOperand());
59764 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59765 return Ld;
59766 }
59767 }
59768 }
59769
59770 // Attempt to extract from the source of a shuffle vector.
59771 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59772 SmallVector<int, 32> ShuffleMask;
59773 SmallVector<int, 32> ScaledMask;
59774 SmallVector<SDValue, 2> ShuffleInputs;
59775 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59776 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59777 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59778 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59779 unsigned SubVecIdx = IdxVal / NumSubElts;
59780 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59781 return DAG.getUNDEF(VT);
59782 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59783 return getZeroVector(VT, Subtarget, DAG, DL);
59784 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59785 if (Src.getValueSizeInBits() == InSizeInBits) {
59786 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59787 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59788 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59789 DL, SizeInBits);
59790 }
59791 }
59792 }
59793
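// Operands from which a subvector extraction is free: one-use loads,
// constant build vectors, and undef.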
59794 auto IsExtractFree = [](SDValue V) {
59795 if (V.hasOneUse()) {
59796 V = peekThroughOneUseBitcasts(V);
59797 if (V.getOpcode() == ISD::LOAD)
59798 return true;
59799 }
59800 V = peekThroughBitcasts(V);
59801 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59802 return true;
59803 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59804 return true;
59805 return V.isUndef();
59806 };
59807
59808 // If we're extracting the lowest subvector and we're the only user,
59809 // we may be able to perform this with a smaller vector width.
59810 unsigned InOpcode = InVec.getOpcode();
59811 if (InVec.hasOneUse()) {
59812 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59813 // v2f64 CVTDQ2PD(v4i32).
59814 if (InOpcode == ISD::SINT_TO_FP &&
59815 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59816 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59817 }
59818 // v2f64 CVTUDQ2PD(v4i32).
59819 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59820 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59821 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59822 }
59823 // v2f64 CVTPS2PD(v4f32).
59824 if (InOpcode == ISD::FP_EXTEND &&
59825 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59826 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59827 }
59828 }
59829 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59830 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59831 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59832 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59833 Subtarget.hasVLX())) &&
59834 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59835 SDValue Src = InVec.getOperand(0);
59836 if (Src.getValueType().getScalarSizeInBits() == 32)
59837 return DAG.getNode(InOpcode, DL, VT,
59838 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59839 }
59840 if (IdxVal == 0 &&
59841 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59842 (SizeInBits == 128 || SizeInBits == 256) &&
59843 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59844 SDValue Ext = InVec.getOperand(0);
59845 if (Ext.getValueSizeInBits() > SizeInBits)
59846 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59847 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59848 return DAG.getNode(ExtOp, DL, VT, Ext);
59849 }
59850 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59851 InVec.getOperand(0).getValueType().is256BitVector() &&
59852 InVec.getOperand(1).getValueType().is256BitVector() &&
59853 InVec.getOperand(2).getValueType().is256BitVector()) {
59854 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59855 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59856 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59857 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59858 }
59859 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59860 (SizeInBits == 128 || SizeInBits == 256)) {
59861 SDValue InVecSrc = InVec.getOperand(0);
59862 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59863 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59864 return DAG.getNode(InOpcode, DL, VT, Ext);
59865 }
59866
59867 if (SizeInBits == 128 || SizeInBits == 256) {
59868 switch (InOpcode) {
59869 case X86ISD::MOVDDUP:
59870 return DAG.getNode(
59871 InOpcode, DL, VT,
59872 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59873 case X86ISD::PSHUFD:
59874 case X86ISD::VPERMILPI:
59875 if (InVec.getOperand(0).hasOneUse()) {
59876 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59877 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59878 return DAG.getNode(InOpcode, DL, VT,
59879 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59880 DL, SizeInBits),
59881 DAG.getTargetConstant(M, DL, MVT::i8));
59882 }
59883 break;
59884 case X86ISD::PCMPEQ:
59885 case X86ISD::PCMPGT:
59886 case X86ISD::UNPCKH:
59887 case X86ISD::UNPCKL:
59888 if (IsExtractFree(InVec.getOperand(0)) ||
59889 IsExtractFree(InVec.getOperand(1)))
59890 return DAG.getNode(InOpcode, DL, VT,
59891 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59892 DL, SizeInBits),
59893 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59894 DL, SizeInBits));
59895 break;
59896 case X86ISD::CMPP:
59897 if (IsExtractFree(InVec.getOperand(0)) ||
59898 IsExtractFree(InVec.getOperand(1)))
59899 return DAG.getNode(InOpcode, DL, VT,
59900 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59901 DL, SizeInBits),
59902 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59903 DL, SizeInBits),
59904 InVec.getOperand(2));
59905 break;
59906 case X86ISD::BLENDI:
59907 if (IsExtractFree(InVec.getOperand(0)) ||
59908 IsExtractFree(InVec.getOperand(1))) {
59909 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59910 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59911 return DAG.getNode(InOpcode, DL, VT,
59912 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59913 DL, SizeInBits),
59914 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59915 DL, SizeInBits),
59916 DAG.getTargetConstant(M, DL, MVT::i8));
59917 }
59918 break;
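// For a variable permute, extracting an upper subvector of the result is the
// same as permuting with just the corresponding mask elements: pull out that
// part of the mask, widen it back to the full width, and take the low
// subvector of the new permute.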
59919 case X86ISD::VPERMV:
59920 if (IdxVal != 0) {
59921 SDValue Mask = InVec.getOperand(0);
59922 SDValue Src = InVec.getOperand(1);
59923 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59924 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59925 DL, InSizeInBits);
59926 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
59927 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59928 }
59929 break;
59930 case X86ISD::VPERMV3:
59931 if (IdxVal != 0) {
59932 SDValue Src0 = InVec.getOperand(0);
59933 SDValue Mask = InVec.getOperand(1);
59934 SDValue Src1 = InVec.getOperand(2);
59935 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
59936 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
59937 DL, InSizeInBits);
59938 SDValue Shuffle =
59939 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
59940 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
59941 }
59942 break;
59943 }
59944 }
59945 }
59946
59947 // Always split vXi64 logical shifts where we're extracting the upper 32 bits
59948 // as this is very likely to fold into a shuffle/truncation.
59949 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
59950 InVecVT.getScalarSizeInBits() == 64 &&
59951 InVec.getConstantOperandAPInt(1) == 32) {
59952 SDValue Ext =
59953 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
59954 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
59955 }
59956
59957 return SDValue();
59958}
59959
59960 static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
59961 const X86Subtarget &Subtarget) {
59962 using namespace SDPatternMatch;
59963 EVT VT = N->getValueType(0);
59964 SDValue Src = N->getOperand(0);
59965 SDLoc DL(N);
59966
59967 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
59968 // This occurs frequently in our masked scalar intrinsic code and our
59969 // floating point select lowering with AVX512.
59970 // TODO: SimplifyDemandedBits instead?
59971 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
59972 isOneConstant(Src.getOperand(1)))
59973 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
59974
59975 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
59976 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
59977 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
59978 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
59979 isNullConstant(Src.getOperand(1)))
59980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
59981 Src.getOperand(1));
59982
59983 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
59984 // TODO: Move to DAGCombine/SimplifyDemandedBits?
59985 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
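// Returns the narrow (<= 32-bit) source if Op is a 64-bit any/zero extension
// of it, or Op itself for a matching extending load or a value whose upper
// 32 bits are known to be zero.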
59986 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
59987 if (Op.getValueType() != MVT::i64)
59988 return SDValue();
59989 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
59990 if (Op.getOpcode() == Opc &&
59991 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
59992 return Op.getOperand(0);
59993 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
59994 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
59995 if (Ld->getExtensionType() == Ext &&
59996 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
59997 return Op;
59998 if (IsZeroExt) {
59999 KnownBits Known = DAG.computeKnownBits(Op);
60000 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60001 return Op;
60002 }
60003 return SDValue();
60004 };
60005
60006 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60007 return DAG.getBitcast(
60008 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60009 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60010
60011 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60012 return DAG.getBitcast(
60013 VT,
60014 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60015 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60016 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60017 }
60018
60019 if (Src.getOpcode() == ISD::BITCAST) {
60020 SDValue SrcOp = Src.getOperand(0);
60021 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60022 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60023 return DAG.getBitcast(
60024 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60025 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60026 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60027 return DAG.getBitcast(
60028 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60029 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60030 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60031 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60032 }
60033
60034 if (VT == MVT::v4i32) {
60035 SDValue HalfSrc;
60036 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60037 // to remove XMM->GPR->XMM moves.
60038 if (sd_match(Src, m_AnyExt(m_BitCast(
60039 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60040 return DAG.getBitcast(
60041 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60042 }
60043
60044 // See if we're broadcasting the scalar value, in which case just reuse that.
60045 // Ensure the same SDValue from the SDNode use is being used.
60046 if (VT.getScalarType() == Src.getValueType())
60047 for (SDNode *User : Src->users())
60048 if (User->getOpcode() == X86ISD::VBROADCAST &&
60049 Src == User->getOperand(0)) {
60050 unsigned SizeInBits = VT.getFixedSizeInBits();
60051 unsigned BroadcastSizeInBits =
60052 User->getValueSizeInBits(0).getFixedValue();
60053 if (BroadcastSizeInBits == SizeInBits)
60054 return SDValue(User, 0);
60055 if (BroadcastSizeInBits > SizeInBits)
60056 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60057 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60058 // coverage.
60059 }
60060
60061 // Check for cases where we've ended up with a scalarized shift, typically
60062 // during type legalization.
60063 switch (Src.getOpcode()) {
60064 case ISD::SHL:
60065 case ISD::SRL:
60066 case ISD::SRA:
60067 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60068 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60069 Src.hasOneUse()) {
60070 SDValue SrcVec =
60071 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60072 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60073 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60074 Amt->getZExtValue(), DAG);
60075 }
60076 }
60077 break;
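// Re-vectorize a scalarized funnel shift by a constant: wrap both scalar
// operands back into vectors and emit a vector FSHL/FSHR with the amount
// reduced modulo the element width.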
60078 case ISD::FSHL:
60079 case ISD::FSHR:
60080 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60081 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60082 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60083 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60084 Src.hasOneUse()) {
60085 uint64_t AmtVal =
60086 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60087 SDValue SrcVec0 =
60088 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60089 SDValue SrcVec1 =
60090 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60091 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60092 DAG.getConstant(AmtVal, DL, VT));
60093 }
60094 }
60095 break;
60096 }
60097
60098 return SDValue();
60099}
60100
60101// Simplify PMULDQ and PMULUDQ operations.
60102 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60103 TargetLowering::DAGCombinerInfo &DCI,
60104 const X86Subtarget &Subtarget) {
60105 SDValue LHS = N->getOperand(0);
60106 SDValue RHS = N->getOperand(1);
60107
60108 // Canonicalize constant to RHS.
60109 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60110 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60111 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60112
60113 // Multiply by zero.
60114 // Don't return RHS as it may contain UNDEFs.
60115 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60116 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60117
60118 // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
60119 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60120 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60121 return SDValue(N, 0);
60122
60123 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60124 // convert it to any_extend_invec, due to the LegalOperations check, do the
60125 // conversion directly to a vector shuffle manually. This exposes combine
60126 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60127 // combineX86ShufflesRecursively on SSE4.1 targets.
60128 // FIXME: This is basically a hack around several other issues related to
60129 // ANY_EXTEND_VECTOR_INREG.
60130 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60131 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60132 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60133 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60134 SDLoc dl(N);
60135 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60136 LHS.getOperand(0), { 0, -1, 1, -1 });
60137 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60138 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60139 }
60140 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60141 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60142 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60143 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60144 SDLoc dl(N);
60145 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60146 RHS.getOperand(0), { 0, -1, 1, -1 });
60147 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60148 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60149 }
60150
60151 return SDValue();
60152}
60153
60154// Simplify VPMADDUBSW/VPMADDWD operations.
60155 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60156 TargetLowering::DAGCombinerInfo &DCI) {
60157 MVT VT = N->getSimpleValueType(0);
60158 SDValue LHS = N->getOperand(0);
60159 SDValue RHS = N->getOperand(1);
60160 unsigned Opc = N->getOpcode();
60161 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60163 "Unexpected PMADD opcode");
60164
60165 // Multiply by zero.
60166 // Don't return LHS/RHS as it may contain UNDEFs.
60167 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60168 ISD::isBuildVectorAllZeros(RHS.getNode()))
60169 return DAG.getConstant(0, SDLoc(N), VT);
60170
60171 // Constant folding.
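// Each result element is formed from a pair of adjacent source elements:
// VPMADDWD computes sext(l0)*sext(r0) + sext(l1)*sext(r1), while VPMADDUBSW
// computes sadd_sat(zext(l0)*sext(r0), zext(l1)*sext(r1)).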
60172 APInt LHSUndefs, RHSUndefs;
60173 SmallVector<APInt> LHSBits, RHSBits;
60174 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60175 unsigned DstEltBits = VT.getScalarSizeInBits();
60176 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60177 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60178 SmallVector<APInt> Result;
60179 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60180 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60181 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60182 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60183 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60184 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60185 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60186 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60187 Result.push_back(Res);
60188 }
60189 return getConstVector(Result, VT, DAG, SDLoc(N));
60190 }
60191
60192 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60193 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60194 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60195 return SDValue(N, 0);
60196
60197 return SDValue();
60198}
60199
60200// Simplify VPMADD52L/VPMADD52H operations.
60201 static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60202 TargetLowering::DAGCombinerInfo &DCI) {
60203 MVT VT = N->getSimpleValueType(0);
60204 unsigned NumEltBits = VT.getScalarSizeInBits();
60205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60206 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60207 DCI))
60208 return SDValue(N, 0);
60209
60210 return SDValue();
60211}
60212
60213 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60214 TargetLowering::DAGCombinerInfo &DCI,
60215 const X86Subtarget &Subtarget) {
60216 EVT VT = N->getValueType(0);
60217 SDValue In = N->getOperand(0);
60218 unsigned Opcode = N->getOpcode();
60219 unsigned InOpcode = In.getOpcode();
60220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60221 SDLoc DL(N);
60222
60223 // Try to merge vector loads and extend_inreg to an extload.
60224 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60225 In.hasOneUse()) {
60226 auto *Ld = cast<LoadSDNode>(In);
60227 if (Ld->isSimple()) {
60228 MVT SVT = In.getSimpleValueType().getVectorElementType();
60229 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60230 ? ISD::SEXTLOAD
60231 : ISD::ZEXTLOAD;
60232 EVT MemVT = VT.changeVectorElementType(SVT);
60233 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60234 SDValue Load = DAG.getExtLoad(
60235 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60236 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60237 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60238 return Load;
60239 }
60240 }
60241 }
60242
60243 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60244 if (Opcode == InOpcode)
60245 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60246
60247 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60248 // -> EXTEND_VECTOR_INREG(X).
60249 // TODO: Handle non-zero subvector indices.
60250 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60251 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60252 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60253 In.getValueSizeInBits())
60254 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60255
60256 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60257 // TODO: Move to DAGCombine?
60258 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60259 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60260 In.getValueSizeInBits() == VT.getSizeInBits()) {
60261 unsigned NumElts = VT.getVectorNumElements();
60262 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60263 EVT EltVT = In.getOperand(0).getValueType();
60264 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60265 for (unsigned I = 0; I != NumElts; ++I)
60266 Elts[I * Scale] = In.getOperand(I);
60267 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60268 }
60269
60270 // Attempt to combine as a shuffle on SSE41+ targets.
60271 if (Subtarget.hasSSE41()) {
60272 SDValue Op(N, 0);
60273 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60274 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60275 return Res;
60276 }
60277
60278 return SDValue();
60279}
60280
60281 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60282 TargetLowering::DAGCombinerInfo &DCI) {
60283 EVT VT = N->getValueType(0);
60284 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60285 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60286 return DAG.getConstant(0, SDLoc(N), VT);
60287
60288 // Fold kshiftr(extract_subvector(X,C1),C2)
60289 // --> extract_subvector(kshiftr(X,C1+C2),0)
60290 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60291 if (N->getOpcode() == X86ISD::KSHIFTR) {
60292 SDLoc DL(N);
60293 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60294 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60295 SDValue Src = N->getOperand(0).getOperand(0);
60296 uint64_t Amt = N->getConstantOperandVal(1) +
60297 N->getOperand(0).getConstantOperandVal(1);
60298 EVT SrcVT = Src.getValueType();
60299 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60300 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60301 DAG.getTargetConstant(Amt, DL, MVT::i8));
60302 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60303 DAG.getVectorIdxConstant(0, DL));
60304 }
60305 }
60306 }
60307
60308 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60309 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60310 return SDValue(N, 0);
60311
60312 return SDValue();
60313}
60314
60315// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60316 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
60317 // extra instructions between the conversions due to going to scalar and back.
60318 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60319 const X86Subtarget &Subtarget) {
60320 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60321 return SDValue();
60322
60323 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60324 return SDValue();
60325
60326 if (N->getValueType(0) != MVT::f32 ||
60327 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60328 return SDValue();
60329
60330 SDLoc dl(N);
60331 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60332 N->getOperand(0).getOperand(0));
60333 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60334 DAG.getTargetConstant(4, dl, MVT::i32));
60335 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60336 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60337 DAG.getVectorIdxConstant(0, dl));
60338}
60339
60340 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60341 TargetLowering::DAGCombinerInfo &DCI,
60342 const X86Subtarget &Subtarget) {
60343 EVT VT = N->getValueType(0);
60344 bool IsStrict = N->isStrictFPOpcode();
60345 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60346 EVT SrcVT = Src.getValueType();
60347
60348 SDLoc dl(N);
60349 if (SrcVT.getScalarType() == MVT::bf16) {
60350 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60351 !IsStrict && Src.getOperand(0).getValueType() == VT)
60352 return Src.getOperand(0);
60353
60354 if (!SrcVT.isVector())
60355 return SDValue();
60356
60357 assert(!IsStrict && "Strict FP doesn't support BF16");
60358 if (VT.getVectorElementType() == MVT::f64) {
60359 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60360 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60361 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60362 }
60363 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
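// bf16 -> f32 is purely a bit operation: zero-extend each 16-bit pattern and
// shift it into the top half of the corresponding 32-bit lane.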
60364 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60365 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60366 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60367 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60368 return DAG.getBitcast(VT, Src);
60369 }
60370
60371 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60372 return SDValue();
60373
60374 if (Subtarget.hasFP16())
60375 return SDValue();
60376
60377 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60378 return SDValue();
60379
60380 if (VT.getVectorElementType() != MVT::f32 &&
60381 VT.getVectorElementType() != MVT::f64)
60382 return SDValue();
60383
60384 unsigned NumElts = VT.getVectorNumElements();
60385 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60386 return SDValue();
60387
60388 // Convert the input to vXi16.
60389 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60390 Src = DAG.getBitcast(IntVT, Src);
60391
60392 // Widen to at least 8 input elements.
60393 if (NumElts < 8) {
60394 unsigned NumConcats = 8 / NumElts;
60395 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60396 : DAG.getConstant(0, dl, IntVT);
60397 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60398 Ops[0] = Src;
60399 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60400 }
60401
60402 // Destination is vXf32 with at least 4 elements.
60403 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60404 std::max(4U, NumElts));
60405 SDValue Cvt, Chain;
60406 if (IsStrict) {
60407 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60408 {N->getOperand(0), Src});
60409 Chain = Cvt.getValue(1);
60410 } else {
60411 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60412 }
60413
60414 if (NumElts < 4) {
60415 assert(NumElts == 2 && "Unexpected size");
60416 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60417 DAG.getVectorIdxConstant(0, dl));
60418 }
60419
60420 if (IsStrict) {
60421 // Extend to the original VT if necessary.
60422 if (Cvt.getValueType() != VT) {
60423 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60424 {Chain, Cvt});
60425 Chain = Cvt.getValue(1);
60426 }
60427 return DAG.getMergeValues({Cvt, Chain}, dl);
60428 }
60429
60430 // Extend to the original VT if necessary.
60431 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60432}
60433
60434// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60435 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60436 TargetLowering::DAGCombinerInfo &DCI) {
60437 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60438 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60439 "Unknown broadcast load type");
60440
60441 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60442 SDValue Ptr = MemIntrin->getBasePtr();
60443 SDValue Chain = MemIntrin->getChain();
60444 EVT VT = N->getSimpleValueType(0);
60445 EVT MemVT = MemIntrin->getMemoryVT();
60446
60447 // Look at other users of our base pointer and try to find a wider broadcast.
60448 // The input chain and the size of the memory VT must match.
60449 for (SDNode *User : Ptr->users())
60450 if (User != N && User->getOpcode() == N->getOpcode() &&
60451 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60452 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60453 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60454 MemVT.getSizeInBits() &&
60455 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60456 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60457 MemIntrin->isSimple() && "Illegal broadcast load type");
60459 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60460 VT.getSizeInBits());
60461 Extract = DAG.getBitcast(VT, Extract);
60462 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60463 return Extract;
60464 }
60465
60466 return SDValue();
60467}
60468
60469 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60470 const X86Subtarget &Subtarget) {
60471 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60472 return SDValue();
60473
60474 bool IsStrict = N->isStrictFPOpcode();
60475 EVT VT = N->getValueType(0);
60476 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60477 EVT SrcVT = Src.getValueType();
60478
60479 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60480 SrcVT.getVectorElementType() != MVT::f32)
60481 return SDValue();
60482
60483 SDLoc dl(N);
60484
60485 SDValue Cvt, Chain;
60486 unsigned NumElts = VT.getVectorNumElements();
60487 if (Subtarget.hasFP16()) {
60488 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60489 // v4f32 (xint_to_fp v4i64))))
60490 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60491 // v8f16 (CVTXI2P v4i64)))
60492 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60493 Src.getNumOperands() == 2) {
60494 SDValue Cvt0, Cvt1;
60495 SDValue Op0 = Src.getOperand(0);
60496 SDValue Op1 = Src.getOperand(1);
60497 bool IsOp0Strict = Op0->isStrictFPOpcode();
60498 if (Op0.getOpcode() != Op1.getOpcode() ||
60499 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60500 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60501 return SDValue();
60502 }
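// Each CVTXI2P result only defines its low four f16 elements; the shuffle
// below concatenates those two low halves.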
60503 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60504 if (IsStrict) {
60505 assert(IsOp0Strict && "Op0 must be strict node");
60506 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60507 ? X86ISD::STRICT_CVTSI2P
60508 : X86ISD::STRICT_CVTUI2P;
60509 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60510 {Op0.getOperand(0), Op0.getOperand(1)});
60511 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60512 {Op1.getOperand(0), Op1.getOperand(1)});
60513 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60514 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60515 }
60516 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60517 : X86ISD::CVTUI2P;
60518 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60519 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60520 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60521 }
60522 return SDValue();
60523 }
60524
60525 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60526 return SDValue();
60527
60528 // Widen to at least 4 input elements.
60529 if (NumElts < 4)
60530 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60531 DAG.getConstantFP(0.0, dl, SrcVT));
60532
60533 // Destination is v8i16 with at least 8 elements.
60534 EVT CvtVT =
60535 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60536 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60537 if (IsStrict) {
60538 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60539 {N->getOperand(0), Src, Rnd});
60540 Chain = Cvt.getValue(1);
60541 } else {
60542 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60543 }
60544
60545 // Extract down to real number of elements.
60546 if (NumElts < 8) {
60547 EVT IntVT = VT.changeVectorElementTypeToInteger();
60548 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60549 DAG.getVectorIdxConstant(0, dl));
60550 }
60551
60552 Cvt = DAG.getBitcast(VT, Cvt);
60553
60554 if (IsStrict)
60555 return DAG.getMergeValues({Cvt, Chain}, dl);
60556
60557 return Cvt;
60558}
60559
60560 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60561 SDValue Src = N->getOperand(0);
60562
60563 // Turn MOVDQ2Q+simple_load into an mmx load.
60564 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60565 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60566
60567 if (LN->isSimple()) {
60568 SDValue NewLd =
60569 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60570 LN->getPointerInfo(), LN->getBaseAlign(),
60571 LN->getMemOperand()->getFlags());
60572 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60573 return NewLd;
60574 }
60575 }
60576
60577 return SDValue();
60578}
60579
60580 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60581 TargetLowering::DAGCombinerInfo &DCI) {
60582 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60584 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60585 return SDValue(N, 0);
60586
60587 return SDValue();
60588}
60589
60590// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60591// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60592// use x86mmx instead.
60593 static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60594 SDLoc dl(N);
60595
60596 bool MadeChange = false, CastReturnVal = false;
60597 SmallVector<SDValue, 8> Args;
60598 for (const SDValue &Arg : N->op_values()) {
60599 if (Arg.getValueType() == MVT::v1i64) {
60600 MadeChange = true;
60601 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60602 } else
60603 Args.push_back(Arg);
60604 }
60605 SDVTList VTs = N->getVTList();
60606 SDVTList NewVTs = VTs;
60607 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60608 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60609 NewVTArr[0] = MVT::x86mmx;
60610 NewVTs = DAG.getVTList(NewVTArr);
60611 MadeChange = true;
60612 CastReturnVal = true;
60613 }
60614
60615 if (MadeChange) {
60616 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60617 if (CastReturnVal) {
60618 SmallVector<SDValue, 2> Returns;
60619 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60620 Returns.push_back(Result.getValue(i));
60621 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60622 return DAG.getMergeValues(Returns, dl);
60623 }
60624 return Result;
60625 }
60626 return SDValue();
60627}
60628 static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60629 TargetLowering::DAGCombinerInfo &DCI) {
60630 if (!DCI.isBeforeLegalize())
60631 return SDValue();
60632
60633 unsigned IntNo = N->getConstantOperandVal(0);
60634 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60635
60636 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60637 return FixupMMXIntrinsicTypes(N, DAG);
60638
60639 return SDValue();
60640}
60641
60642 static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60643 TargetLowering::DAGCombinerInfo &DCI) {
60644 if (!DCI.isBeforeLegalize())
60645 return SDValue();
60646
60647 unsigned IntNo = N->getConstantOperandVal(1);
60648 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60649
60650 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60651 return FixupMMXIntrinsicTypes(N, DAG);
60652
60653 return SDValue();
60654}
60655
60656 static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60657 TargetLowering::DAGCombinerInfo &DCI) {
60658 if (!DCI.isBeforeLegalize())
60659 return SDValue();
60660
60661 unsigned IntNo = N->getConstantOperandVal(1);
60662 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60663
60664 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60665 return FixupMMXIntrinsicTypes(N, DAG);
60666
60667 return SDValue();
60668}
60669
60670 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60671 DAGCombinerInfo &DCI) const {
60672 SelectionDAG &DAG = DCI.DAG;
60673 switch (N->getOpcode()) {
60674 // clang-format off
60675 default: break;
60676 case ISD::SCALAR_TO_VECTOR:
60677 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60678 case ISD::EXTRACT_VECTOR_ELT:
60679 case X86ISD::PEXTRW:
60680 case X86ISD::PEXTRB:
60681 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60682 case ISD::CONCAT_VECTORS:
60683 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60684 case ISD::INSERT_SUBVECTOR:
60685 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60686 case ISD::EXTRACT_SUBVECTOR:
60687 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60688 case ISD::VSELECT:
60689 case ISD::SELECT:
60690 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60691 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60692 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60693 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60694 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60695 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60696 case X86ISD::ADD:
60697 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60698 case X86ISD::CLOAD:
60699 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60700 case X86ISD::SBB: return combineSBB(N, DAG);
60701 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60702 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60703 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60704 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60705 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60706 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60707 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60708 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60709 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60710 case ISD::AVGCEILS:
60711 case ISD::AVGCEILU:
60712 case ISD::AVGFLOORS:
60713 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60714 case X86ISD::BEXTR:
60715 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60716 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60717 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60718 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60719 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60720 case X86ISD::VEXTRACT_STORE:
60721 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60722 case ISD::SINT_TO_FP:
60723 case ISD::STRICT_SINT_TO_FP:
60724 return combineSIntToFP(N, DAG, DCI, Subtarget);
60725 case ISD::UINT_TO_FP:
60726 case ISD::STRICT_UINT_TO_FP:
60727 return combineUIntToFP(N, DAG, Subtarget);
60728 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60729 case ISD::LRINT:
60730 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60731 case ISD::FADD:
60732 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60733 case X86ISD::VFCMULC:
60734 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60735 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60736 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60737 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60738 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60739 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60740 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60741 case X86ISD::FXOR:
60742 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60743 case X86ISD::FMIN:
60744 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60745 case ISD::FMINNUM:
60746 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60747 case X86ISD::CVTSI2P:
60748 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60749 case X86ISD::CVTP2SI:
60750 case X86ISD::CVTP2UI:
60751 case X86ISD::STRICT_CVTTP2SI:
60752 case X86ISD::CVTTP2SI:
60753 case X86ISD::STRICT_CVTTP2UI:
60754 case X86ISD::CVTTP2UI:
60755 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60756 case X86ISD::STRICT_CVTPH2PS:
60757 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60758 case X86ISD::BT: return combineBT(N, DAG, DCI);
60759 case ISD::ANY_EXTEND:
60760 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60761 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60762 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60763 case ISD::ANY_EXTEND_VECTOR_INREG:
60764 case ISD::SIGN_EXTEND_VECTOR_INREG:
60765 case ISD::ZERO_EXTEND_VECTOR_INREG:
60766 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60767 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60768 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60769 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60770 case X86ISD::PACKSS:
60771 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60772 case X86ISD::HADD:
60773 case X86ISD::HSUB:
60774 case X86ISD::FHADD:
60775 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60776 case X86ISD::VSHL:
60777 case X86ISD::VSRA:
60778 case X86ISD::VSRL:
60779 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60780 case X86ISD::VSHLI:
60781 case X86ISD::VSRAI:
60782 case X86ISD::VSRLI:
60783 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60784 case ISD::INSERT_VECTOR_ELT:
60785 case X86ISD::PINSRB:
60786 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60787 case X86ISD::SHUFP: // Handle all target specific shuffles
60788 case X86ISD::INSERTPS:
60789 case X86ISD::EXTRQI:
60790 case X86ISD::INSERTQI:
60791 case X86ISD::VALIGN:
60792 case X86ISD::PALIGNR:
60793 case X86ISD::VSHLDQ:
60794 case X86ISD::VSRLDQ:
60795 case X86ISD::BLENDI:
60796 case X86ISD::UNPCKH:
60797 case X86ISD::UNPCKL:
60798 case X86ISD::MOVHLPS:
60799 case X86ISD::MOVLHPS:
60800 case X86ISD::PSHUFB:
60801 case X86ISD::PSHUFD:
60802 case X86ISD::PSHUFHW:
60803 case X86ISD::PSHUFLW:
60804 case X86ISD::MOVSHDUP:
60805 case X86ISD::MOVSLDUP:
60806 case X86ISD::MOVDDUP:
60807 case X86ISD::MOVSS:
60808 case X86ISD::MOVSD:
60809 case X86ISD::MOVSH:
60810 case X86ISD::VBROADCAST:
60811 case X86ISD::VPPERM:
60812 case X86ISD::VPERMI:
60813 case X86ISD::VPERMV:
60814 case X86ISD::VPERMV3:
60815 case X86ISD::VPERMIL2:
60816 case X86ISD::VPERMILPI:
60817 case X86ISD::VPERMILPV:
60818 case X86ISD::VPERM2X128:
60819 case X86ISD::SHUF128:
60820 case X86ISD::VZEXT_MOVL:
60821 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60822 case X86ISD::FMADD_RND:
60823 case X86ISD::FMSUB:
60824 case X86ISD::STRICT_FMSUB:
60825 case X86ISD::FMSUB_RND:
60826 case X86ISD::FNMADD:
60827 case X86ISD::STRICT_FNMADD:
60828 case X86ISD::FNMADD_RND:
60829 case X86ISD::FNMSUB:
60830 case X86ISD::STRICT_FNMSUB:
60831 case X86ISD::FNMSUB_RND:
60832 case ISD::FMA:
60833 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60834 case X86ISD::FMADDSUB_RND:
60835 case X86ISD::FMSUBADD_RND:
60836 case X86ISD::FMADDSUB:
60837 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60838 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60839 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60840 case X86ISD::MGATHER:
60841 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60842 case ISD::MGATHER:
60843 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60844 case X86ISD::PCMPEQ:
60845 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60846 case X86ISD::PMULDQ:
60847 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60848 case X86ISD::VPMADDUBSW:
60849 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60850 case X86ISD::VPMADD52L:
60851 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60852 case X86ISD::KSHIFTL:
60853 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60854 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60855 case ISD::STRICT_FP_EXTEND:
60856 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60857 case ISD::STRICT_FP_ROUND:
60858 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60859 case X86ISD::VBROADCAST_LOAD:
60860 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60861 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60862 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60863 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60864 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60865 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60866 case ISD::FP_TO_SINT_SAT:
60867 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60868 // clang-format on
60869 }
60870
60871 return SDValue();
60872}
60873
60875 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60876}
60877
60878// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60880 EVT ExtVT) const {
60881 return Subtarget.hasAVX512() || !VT.isVector();
60882}
60883
60884 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60885 if (!isTypeLegal(VT))
60886 return false;
60887
60888 // There are no vXi8 shifts.
60889 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60890 return false;
60891
60892 // TODO: Almost no 8-bit ops are desirable because they have no actual
60893 // size/speed advantages vs. 32-bit ops, but they do have a major
60894 // potential disadvantage by causing partial register stalls.
60895 //
60896 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60897 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60898 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
60899 // check for a constant operand to the multiply.
60900 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
60901 return false;
60902
60903 // i16 instruction encodings are longer and some i16 instructions are slow,
60904 // so those are not desirable.
60905 if (VT == MVT::i16) {
60906 switch (Opc) {
60907 default:
60908 break;
60909 case ISD::LOAD:
60910 case ISD::SIGN_EXTEND:
60911 case ISD::ZERO_EXTEND:
60912 case ISD::ANY_EXTEND:
60913 case ISD::MUL:
60914 return false;
60915 case ISD::SHL:
60916 case ISD::SRA:
60917 case ISD::SRL:
60918 case ISD::SUB:
60919 case ISD::ADD:
60920 case ISD::AND:
60921 case ISD::OR:
60922 case ISD::XOR:
60923 // NDD instructions never have the "partial register write" issue because the
60924 // destination register's upper bits [63:OSIZE] are zeroed even when
60925 // OSIZE=8/16.
60926 return Subtarget.hasNDD();
60927 }
60928 }
60929
60930 // Any legal type not explicitly accounted for above here is desirable.
60931 return true;
60932}
60933
60934 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
60935 SDValue Value, SDValue Addr,
60936 int JTI,
60937 SelectionDAG &DAG) const {
60938 const Module *M = DAG.getMachineFunction().getFunction().getParent();
60939 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
60940 if (IsCFProtectionSupported) {
60941 // In case control-flow branch protection is enabled, we need to add a
60942 // notrack prefix to the indirect branch.
60943 // In order to do that we create an NT_BRIND SDNode.
60944 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
60945 SDValue Chain = Value;
60946 // Jump table debug info is only needed if CodeView is enabled.
60947 if (DAG.getTarget().getTargetTriple().isOSBinFormatCOFF())
60948 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
60949 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
60950 }
60951
60952 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
60953}
60954
60955 TargetLowering::AndOrSETCCFoldKind
60956 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
60957 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
60959 EVT VT = LogicOp->getValueType(0);
60960 EVT OpVT = SETCC0->getOperand(0).getValueType();
60961 if (!VT.isInteger())
60963
60964 if (VT.isVector())
60969
60970 // Don't use `NotAnd`: even though `not` is generally shorter code size than
60971 // `add`, `add` can lower to LEA, which can save moves / spills. In any case
60972 // where `NotAnd` applies, `AddAnd` does as well.
60973 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
60974 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
60976}
60977
60978 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
60979 EVT VT = Op.getValueType();
60980 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
60981 isa<ConstantSDNode>(Op.getOperand(1));
60982
60983 // i16 is legal, but undesirable since i16 instruction encodings are longer
60984 // and some i16 instructions are slow.
60985 // 8-bit multiply-by-constant can usually be expanded to something cheaper
60986 // using LEA and/or other ALU ops.
60987 if (VT != MVT::i16 && !Is8BitMulByConstant)
60988 return false;
60989
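// A load feeding Op can be folded as a read-modify-write only when Op's sole
// user is a store back to the same base address.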
60990 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
60991 if (!Op.hasOneUse())
60992 return false;
60993 SDNode *User = *Op->user_begin();
60994 if (User->getOpcode() != ISD::STORE)
60995 return false;
60996 auto *Ld = cast<LoadSDNode>(Load);
60997 auto *St = cast<StoreSDNode>(User);
60998 return Ld->getBasePtr() == St->getBasePtr();
60999 };
61000
61001 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61002 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61003 return false;
61004 if (!Op.hasOneUse())
61005 return false;
61006 SDNode *User = *Op->user_begin();
61007 if (User->getOpcode() != ISD::ATOMIC_STORE)
61008 return false;
61009 auto *Ld = cast<AtomicSDNode>(Load);
61010 auto *St = cast<AtomicSDNode>(User);
61011 return Ld->getBasePtr() == St->getBasePtr();
61012 };
61013
61014 auto IsFoldableZext = [](SDValue Op) {
61015 if (!Op.hasOneUse())
61016 return false;
61017 SDNode *User = *Op->user_begin();
61018 EVT VT = User->getValueType(0);
61019 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61020 (VT == MVT::i32 || VT == MVT::i64));
61021 };
61022
61023 bool Commute = false;
61024 switch (Op.getOpcode()) {
61025 default: return false;
61026 case ISD::SIGN_EXTEND:
61027 case ISD::ZERO_EXTEND:
61028 case ISD::ANY_EXTEND:
61029 break;
61030 case ISD::SHL:
61031 case ISD::SRA:
61032 case ISD::SRL: {
61033 SDValue N0 = Op.getOperand(0);
61034 // Look out for (store (shl (load), x)).
61035 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61036 return false;
61037 break;
61038 }
61039 case ISD::MUL:
61040 // When ZU is enabled, we prefer to not promote for MUL by a constant
61041 // when there is an opportunity to fold a zext with imulzu.
61042 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61043 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61044 isa<ConstantSDNode>(Op.getOperand(1))))
61045 return false;
61046 [[fallthrough]];
61047 case ISD::ADD:
61048 case ISD::AND:
61049 case ISD::OR:
61050 case ISD::XOR:
61051 Commute = true;
61052 [[fallthrough]];
61053 case ISD::SUB: {
61054 SDValue N0 = Op.getOperand(0);
61055 SDValue N1 = Op.getOperand(1);
61056 // Avoid disabling potential load folding opportunities.
61057 if (X86::mayFoldLoad(N1, Subtarget) &&
61058 (!Commute || !isa<ConstantSDNode>(N0) ||
61059 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61060 return false;
61061 if (X86::mayFoldLoad(N0, Subtarget) &&
61062 ((Commute && !isa<ConstantSDNode>(N1)) ||
61063 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61064 return false;
61065 if (IsFoldableAtomicRMW(N0, Op) ||
61066 (Commute && IsFoldableAtomicRMW(N1, Op)))
61067 return false;
61068 }
61069 }
61070
61071 PVT = MVT::i32;
61072 return true;
61073}
61074
61075//===----------------------------------------------------------------------===//
61076// X86 Inline Assembly Support
61077//===----------------------------------------------------------------------===//
61078
61079static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
61080 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
61081 .Case("{@cca}", X86::COND_A)
61082 .Case("{@ccae}", X86::COND_AE)
61083 .Case("{@ccb}", X86::COND_B)
61084 .Case("{@ccbe}", X86::COND_BE)
61085 .Case("{@ccc}", X86::COND_B)
61086 .Case("{@cce}", X86::COND_E)
61087 .Case("{@ccz}", X86::COND_E)
61088 .Case("{@ccg}", X86::COND_G)
61089 .Case("{@ccge}", X86::COND_GE)
61090 .Case("{@ccl}", X86::COND_L)
61091 .Case("{@ccle}", X86::COND_LE)
61092 .Case("{@ccna}", X86::COND_BE)
61093 .Case("{@ccnae}", X86::COND_B)
61094 .Case("{@ccnb}", X86::COND_AE)
61095 .Case("{@ccnbe}", X86::COND_A)
61096 .Case("{@ccnc}", X86::COND_AE)
61097 .Case("{@ccne}", X86::COND_NE)
61098 .Case("{@ccnz}", X86::COND_NE)
61099 .Case("{@ccng}", X86::COND_LE)
61100 .Case("{@ccnge}", X86::COND_L)
61101 .Case("{@ccnl}", X86::COND_GE)
61102 .Case("{@ccnle}", X86::COND_G)
61103 .Case("{@ccno}", X86::COND_NO)
61104 .Case("{@ccnp}", X86::COND_NP)
61105 .Case("{@ccns}", X86::COND_NS)
61106 .Case("{@cco}", X86::COND_O)
61107 .Case("{@ccp}", X86::COND_P)
61108 .Case("{@ccs}", X86::COND_S)
61109 .Default(X86::COND_INVALID);
61110 return Cond;
61111}
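// Illustrative sketch, not part of this file: the "{@cc<cond>}" strings mapped
// above originate from GCC/Clang flag-output constraints in inline asm. A
// minimal x86-only usage, assuming a compiler with flag-output support:
static bool isZeroExample(unsigned X) {
  bool ZF;
  asm("testl %1, %1" : "=@ccz"(ZF) : "r"(X)); // "@ccz" maps to X86::COND_E above
  return ZF;                                  // materialized via SETE by the backend
}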
61112
61113/// Given a constraint letter, return the type of constraint for this target.
61114X86TargetLowering::ConstraintType
61115X86TargetLowering::getConstraintType(StringRef Constraint) const {
61116 if (Constraint.size() == 1) {
61117 switch (Constraint[0]) {
61118 case 'R':
61119 case 'q':
61120 case 'Q':
61121 case 'f':
61122 case 't':
61123 case 'u':
61124 case 'y':
61125 case 'x':
61126 case 'v':
61127 case 'l':
61128 case 'k': // AVX512 masking registers.
61129 return C_RegisterClass;
61130 case 'a':
61131 case 'b':
61132 case 'c':
61133 case 'd':
61134 case 'S':
61135 case 'D':
61136 case 'A':
61137 return C_Register;
61138 case 'I':
61139 case 'J':
61140 case 'K':
61141 case 'N':
61142 case 'G':
61143 case 'L':
61144 case 'M':
61145 return C_Immediate;
61146 case 'C':
61147 case 'e':
61148 case 'Z':
61149 return C_Other;
61150 default:
61151 break;
61152 }
61153 }
61154 else if (Constraint.size() == 2) {
61155 switch (Constraint[0]) {
61156 default:
61157 break;
61158 case 'W':
61159 if (Constraint[1] != 's')
61160 break;
61161 return C_Other;
61162 case 'Y':
61163 switch (Constraint[1]) {
61164 default:
61165 break;
61166 case 'z':
61167 return C_Register;
61168 case 'i':
61169 case 'm':
61170 case 'k':
61171 case 't':
61172 case '2':
61173 return C_RegisterClass;
61174 }
61175 break;
61176 case 'j':
61177 switch (Constraint[1]) {
61178 default:
61179 break;
61180 case 'r':
61181 case 'R':
61182 return C_RegisterClass;
61183 }
61184 }
61185 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61186 return C_Other;
61187 return TargetLowering::getConstraintType(Constraint);
61188}
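// Illustrative sketch, not part of this file: sample use of constraints
// classified above. 'I' is C_Immediate (a constant in [0,31]); plain 'r' is not
// handled here and falls through to the default TargetLowering classification.
static unsigned rotateLeft8Example(unsigned V) {
  asm("roll %1, %0" : "+r"(V) : "I"(8)); // 'I' requires an immediate 0..31
  return V;
}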
61189
61190/// Examine constraint type and operand type and determine a weight value.
61191/// This object must already have been set up with the operand type
61192/// and the current alternative constraint selected.
61193TargetLowering::ConstraintWeight
61194X86TargetLowering::getSingleConstraintMatchWeight(
61195 AsmOperandInfo &Info, const char *Constraint) const {
61196 ConstraintWeight Wt = CW_Invalid;
61197 Value *CallOperandVal = Info.CallOperandVal;
61198 // If we don't have a value, we can't do a match,
61199 // but allow it at the lowest weight.
61200 if (!CallOperandVal)
61201 return CW_Default;
61202 Type *Ty = CallOperandVal->getType();
61203 // Look at the constraint type.
61204 switch (*Constraint) {
61205 default:
61206 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
61207 [[fallthrough]];
61208 case 'R':
61209 case 'q':
61210 case 'Q':
61211 case 'a':
61212 case 'b':
61213 case 'c':
61214 case 'd':
61215 case 'S':
61216 case 'D':
61217 case 'A':
61218 if (CallOperandVal->getType()->isIntegerTy())
61219 Wt = CW_SpecificReg;
61220 break;
61221 case 'f':
61222 case 't':
61223 case 'u':
61224 if (Ty->isFloatingPointTy())
61225 Wt = CW_SpecificReg;
61226 break;
61227 case 'y':
61228 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61229 Wt = CW_SpecificReg;
61230 break;
61231 case 'Y':
61232 if (StringRef(Constraint).size() != 2)
61233 break;
61234 switch (Constraint[1]) {
61235 default:
61236 return CW_Invalid;
61237 // XMM0
61238 case 'z':
61239 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61240 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61241 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61242 return CW_SpecificReg;
61243 return CW_Invalid;
61244 // Conditional OpMask regs (AVX512)
61245 case 'k':
61246 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61247 return CW_Register;
61248 return CW_Invalid;
61249 // Any MMX reg
61250 case 'm':
61251 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61252 return CW_SpecificReg;
61253 return CW_Invalid;
61254 // Any SSE reg when ISA >= SSE2, same as 'x'
61255 case 'i':
61256 case 't':
61257 case '2':
61258 if (!Subtarget.hasSSE2())
61259 return CW_Invalid;
61260 break;
61261 }
61262 break;
61263 case 'j':
61264 if (StringRef(Constraint).size() != 2)
61265 break;
61266 switch (Constraint[1]) {
61267 default:
61268 return CW_Invalid;
61269 case 'r':
61270 case 'R':
61271 if (CallOperandVal->getType()->isIntegerTy())
61272 Wt = CW_SpecificReg;
61273 break;
61274 }
61275 break;
61276 case 'v':
61277 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61278 Wt = CW_Register;
61279 [[fallthrough]];
61280 case 'x':
61281 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61282 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61283 Wt = CW_Register;
61284 break;
61285 case 'k':
61286 // Enable conditional vector operations using %k<#> registers.
61287 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61288 Wt = CW_Register;
61289 break;
61290 case 'I':
61291 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61292 if (C->getZExtValue() <= 31)
61293 Wt = CW_Constant;
61294 break;
61295 case 'J':
61296 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61297 if (C->getZExtValue() <= 63)
61298 Wt = CW_Constant;
61299 break;
61300 case 'K':
61301 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61302 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61303 Wt = CW_Constant;
61304 break;
61305 case 'L':
61306 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61307 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61308 Wt = CW_Constant;
61309 break;
61310 case 'M':
61311 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61312 if (C->getZExtValue() <= 3)
61313 Wt = CW_Constant;
61314 break;
61315 case 'N':
61316 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61317 if (C->getZExtValue() <= 0xff)
61318 Wt = CW_Constant;
61319 break;
61320 case 'G':
61321 case 'C':
61322 if (isa<ConstantFP>(CallOperandVal))
61323 Wt = CW_Constant;
61324 break;
61325 case 'e':
61326 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61327 if ((C->getSExtValue() >= -0x80000000LL) &&
61328 (C->getSExtValue() <= 0x7fffffffLL))
61329 Wt = CW_Constant;
61330 break;
61331 case 'Z':
61332 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61333 if (C->getZExtValue() <= 0xffffffff)
61334 Wt = CW_Constant;
61335 break;
61336 }
61337 return Wt;
61338}
61339
61340/// Try to replace an X constraint, which matches anything, with another that
61341/// has more specific requirements based on the type of the corresponding
61342/// operand.
61343const char *X86TargetLowering::
61344LowerXConstraint(EVT ConstraintVT) const {
61345 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61346 // 'f' like normal targets.
61347 if (ConstraintVT.isFloatingPoint()) {
61348 if (Subtarget.hasSSE1())
61349 return "x";
61350 }
61351
61352 return TargetLowering::LowerXConstraint(ConstraintVT);
61353}
61354
61355// Lower @cc targets via setcc.
61356SDValue X86TargetLowering::LowerAsmOutputForConstraint(
61357 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61358 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61359 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61360 if (Cond == X86::COND_INVALID)
61361 return SDValue();
61362 // Check that return type is valid.
61363 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61364 OpInfo.ConstraintVT.getSizeInBits() < 8)
61365 report_fatal_error("Glue output operand is of invalid type");
61366
61367 // Get EFLAGS register. Only update chain when copyfrom is glued.
61368 if (Glue.getNode()) {
61369 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61370 Chain = Glue.getValue(1);
61371 } else
61372 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61373 // Extract CC code.
61374 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61375 // Extend to 32-bits
61376 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61377
61378 return Result;
61379}
61380
61381/// Lower the specified operand into the Ops vector.
61382/// If it is invalid, don't add anything to Ops.
61383void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
61384 StringRef Constraint,
61385 std::vector<SDValue> &Ops,
61386 SelectionDAG &DAG) const {
61387 SDValue Result;
61388 char ConstraintLetter = Constraint[0];
61389 switch (ConstraintLetter) {
61390 default: break;
61391 case 'I':
61392 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61393 if (C->getZExtValue() <= 31) {
61394 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61395 Op.getValueType());
61396 break;
61397 }
61398 }
61399 return;
61400 case 'J':
61401 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61402 if (C->getZExtValue() <= 63) {
61403 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61404 Op.getValueType());
61405 break;
61406 }
61407 }
61408 return;
61409 case 'K':
61410 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61411 if (isInt<8>(C->getSExtValue())) {
61412 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61413 Op.getValueType());
61414 break;
61415 }
61416 }
61417 return;
61418 case 'L':
61419 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61420 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61421 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61422 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61423 Op.getValueType());
61424 break;
61425 }
61426 }
61427 return;
61428 case 'M':
61429 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61430 if (C->getZExtValue() <= 3) {
61431 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61432 Op.getValueType());
61433 break;
61434 }
61435 }
61436 return;
61437 case 'N':
61438 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61439 if (C->getZExtValue() <= 255) {
61440 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61441 Op.getValueType());
61442 break;
61443 }
61444 }
61445 return;
61446 case 'O':
61447 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61448 if (C->getZExtValue() <= 127) {
61449 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61450 Op.getValueType());
61451 break;
61452 }
61453 }
61454 return;
61455 case 'e': {
61456 // 32-bit signed value
61457 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61458 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61459 C->getSExtValue())) {
61460 // Widen to 64 bits here to get it sign extended.
61461 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61462 break;
61463 }
61464 // FIXME gcc accepts some relocatable values here too, but only in certain
61465 // memory models; it's complicated.
61466 }
61467 return;
61468 }
61469 case 'W': {
61470 assert(Constraint[1] == 's');
61471 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61472 // offset.
61473 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61474 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61475 BA->getValueType(0)));
61476 } else {
61477 int64_t Offset = 0;
61478 if (Op->getOpcode() == ISD::ADD &&
61479 isa<ConstantSDNode>(Op->getOperand(1))) {
61480 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61481 Op = Op->getOperand(0);
61482 }
61483 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61484 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61485 GA->getValueType(0), Offset));
61486 }
61487 return;
61488 }
61489 case 'Z': {
61490 // 32-bit unsigned value
61491 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61492 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
61493 C->getZExtValue())) {
61494 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61495 Op.getValueType());
61496 break;
61497 }
61498 }
61499 // FIXME gcc accepts some relocatable values here too, but only in certain
61500 // memory models; it's complicated.
61501 return;
61502 }
61503 case 'i': {
61504 // Literal immediates are always ok.
61505 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61506 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61507 BooleanContent BCont = getBooleanContents(MVT::i64);
61508 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61509 : ISD::SIGN_EXTEND;
61510 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61511 : CST->getSExtValue();
61512 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61513 break;
61514 }
61515
61516 // In any sort of PIC mode addresses need to be computed at runtime by
61517 // adding in a register or some sort of table lookup. These can't
61518 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61519 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61520 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
61521 return;
61522
61523 // If we are in non-pic codegen mode, we allow the address of a global (with
61524 // an optional displacement) to be used with 'i'.
61525 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61526 // If we require an extra load to get this address, as in PIC mode, we
61527 // can't accept it.
61528 if (isGlobalStubReference(
61529 Subtarget.classifyGlobalReference(GA->getGlobal())))
61530 return;
61531 break;
61532 }
61533 }
61534
61535 if (Result.getNode()) {
61536 Ops.push_back(Result);
61537 return;
61538 }
61539 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61540}
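// Illustrative sketch, not part of this file: the 'e' case above accepts any
// constant that fits a sign-extended 32-bit immediate, which 64-bit ALU forms
// encode directly. Minimal x86-64 usage with a GCC/Clang-style compiler:
static long addConstantExample(long V) {
  asm("addq %1, %0" : "+r"(V) : "e"(0x12345678L)); // fits in a signed imm32
  return V;
}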
61541
61542/// Check if \p RC is a general purpose register class.
61543/// I.e., GR* or one of their variant.
61544static bool isGRClass(const TargetRegisterClass &RC) {
61545 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61546 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61547 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61548 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61549 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61550}
61551
61552/// Check if \p RC is a vector register class.
61553/// I.e., FR* / VR* or one of their variant.
61554static bool isFRClass(const TargetRegisterClass &RC) {
61555 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61556 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61557 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61558 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61559 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61560 RC.hasSuperClassEq(&X86::VR512RegClass);
61561}
61562
61563/// Check if \p RC is a mask register class.
61564/// I.e., VK* or one of their variant.
61565static bool isVKClass(const TargetRegisterClass &RC) {
61566 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61567 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61568 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61569 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61570 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61571 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61572 RC.hasSuperClassEq(&X86::VK64RegClass);
61573}
61574
61575static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61576 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61577}
61578
61579std::pair<unsigned, const TargetRegisterClass *>
61580X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
61581 StringRef Constraint,
61582 MVT VT) const {
61583 // First, see if this is a constraint that directly corresponds to an LLVM
61584 // register class.
61585 if (Constraint.size() == 1) {
61586 // GCC Constraint Letters
61587 switch (Constraint[0]) {
61588 default: break;
61589 // 'A' means [ER]AX + [ER]DX.
61590 case 'A':
61591 if (Subtarget.is64Bit())
61592 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61593 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61594 "Expecting 64, 32 or 16 bit subtarget");
61595 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61596
61597 // TODO: Slight differences here in allocation order and leaving
61598 // RIP in the class. Do they matter any more here than they do
61599 // in the normal allocation?
61600 case 'k':
61601 if (Subtarget.hasAVX512()) {
61602 if (VT == MVT::v1i1 || VT == MVT::i1)
61603 return std::make_pair(0U, &X86::VK1RegClass);
61604 if (VT == MVT::v8i1 || VT == MVT::i8)
61605 return std::make_pair(0U, &X86::VK8RegClass);
61606 if (VT == MVT::v16i1 || VT == MVT::i16)
61607 return std::make_pair(0U, &X86::VK16RegClass);
61608 }
61609 if (Subtarget.hasBWI()) {
61610 if (VT == MVT::v32i1 || VT == MVT::i32)
61611 return std::make_pair(0U, &X86::VK32RegClass);
61612 if (VT == MVT::v64i1 || VT == MVT::i64)
61613 return std::make_pair(0U, &X86::VK64RegClass);
61614 }
61615 break;
61616 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61617 if (Subtarget.is64Bit()) {
61618 if (VT == MVT::i8 || VT == MVT::i1)
61619 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61620 ? &X86::GR8RegClass
61621 : &X86::GR8_NOREX2RegClass);
61622 if (VT == MVT::i16)
61623 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61624 ? &X86::GR16RegClass
61625 : &X86::GR16_NOREX2RegClass);
61626 if (VT == MVT::i32 || VT == MVT::f32)
61627 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61628 ? &X86::GR32RegClass
61629 : &X86::GR32_NOREX2RegClass);
61630 if (VT != MVT::f80 && !VT.isVector())
61631 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61632 ? &X86::GR64RegClass
61633 : &X86::GR64_NOREX2RegClass);
61634 break;
61635 }
61636 [[fallthrough]];
61637 // 32-bit fallthrough
61638 case 'Q': // Q_REGS
61639 if (VT == MVT::i8 || VT == MVT::i1)
61640 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61641 if (VT == MVT::i16)
61642 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61643 if (VT == MVT::i32 || VT == MVT::f32 ||
61644 (!VT.isVector() && !Subtarget.is64Bit()))
61645 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61646 if (VT != MVT::f80 && !VT.isVector())
61647 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61648 break;
61649 case 'r': // GENERAL_REGS
61650 case 'l': // INDEX_REGS
61651 if (VT == MVT::i8 || VT == MVT::i1)
61652 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61653 ? &X86::GR8RegClass
61654 : &X86::GR8_NOREX2RegClass);
61655 if (VT == MVT::i16)
61656 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61657 ? &X86::GR16RegClass
61658 : &X86::GR16_NOREX2RegClass);
61659 if (VT == MVT::i32 || VT == MVT::f32 ||
61660 (!VT.isVector() && !Subtarget.is64Bit()))
61661 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61662 ? &X86::GR32RegClass
61663 : &X86::GR32_NOREX2RegClass);
61664 if (VT != MVT::f80 && !VT.isVector())
61665 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61666 ? &X86::GR64RegClass
61667 : &X86::GR64_NOREX2RegClass);
61668 break;
61669 case 'R': // LEGACY_REGS
61670 if (VT == MVT::i8 || VT == MVT::i1)
61671 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61672 if (VT == MVT::i16)
61673 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61674 if (VT == MVT::i32 || VT == MVT::f32 ||
61675 (!VT.isVector() && !Subtarget.is64Bit()))
61676 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61677 if (VT != MVT::f80 && !VT.isVector())
61678 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61679 break;
61680 case 'f': // FP Stack registers.
61681 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61682 // value to the correct fpstack register class.
61683 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61684 return std::make_pair(0U, &X86::RFP32RegClass);
61685 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61686 return std::make_pair(0U, &X86::RFP64RegClass);
61687 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61688 return std::make_pair(0U, &X86::RFP80RegClass);
61689 break;
61690 case 'y': // MMX_REGS if MMX allowed.
61691 if (!Subtarget.hasMMX()) break;
61692 return std::make_pair(0U, &X86::VR64RegClass);
61693 case 'v':
61694 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61695 if (!Subtarget.hasSSE1()) break;
61696 bool VConstraint = (Constraint[0] == 'v');
61697
61698 switch (VT.SimpleTy) {
61699 default: break;
61700 // Scalar SSE types.
61701 case MVT::f16:
61702 if (VConstraint && Subtarget.hasFP16())
61703 return std::make_pair(0U, &X86::FR16XRegClass);
61704 break;
61705 case MVT::f32:
61706 case MVT::i32:
61707 if (VConstraint && Subtarget.hasVLX())
61708 return std::make_pair(0U, &X86::FR32XRegClass);
61709 return std::make_pair(0U, &X86::FR32RegClass);
61710 case MVT::f64:
61711 case MVT::i64:
61712 if (VConstraint && Subtarget.hasVLX())
61713 return std::make_pair(0U, &X86::FR64XRegClass);
61714 return std::make_pair(0U, &X86::FR64RegClass);
61715 case MVT::i128:
61716 if (Subtarget.is64Bit()) {
61717 if (VConstraint && Subtarget.hasVLX())
61718 return std::make_pair(0U, &X86::VR128XRegClass);
61719 return std::make_pair(0U, &X86::VR128RegClass);
61720 }
61721 break;
61722 // Vector types and fp128.
61723 case MVT::v8f16:
61724 if (!Subtarget.hasFP16())
61725 break;
61726 if (VConstraint)
61727 return std::make_pair(0U, &X86::VR128XRegClass);
61728 return std::make_pair(0U, &X86::VR128RegClass);
61729 case MVT::v8bf16:
61730 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61731 break;
61732 if (VConstraint)
61733 return std::make_pair(0U, &X86::VR128XRegClass);
61734 return std::make_pair(0U, &X86::VR128RegClass);
61735 case MVT::f128:
61736 if (!Subtarget.is64Bit())
61737 break;
61738 [[fallthrough]];
61739 case MVT::v16i8:
61740 case MVT::v8i16:
61741 case MVT::v4i32:
61742 case MVT::v2i64:
61743 case MVT::v4f32:
61744 case MVT::v2f64:
61745 if (VConstraint && Subtarget.hasVLX())
61746 return std::make_pair(0U, &X86::VR128XRegClass);
61747 return std::make_pair(0U, &X86::VR128RegClass);
61748 // AVX types.
61749 case MVT::v16f16:
61750 if (!Subtarget.hasFP16())
61751 break;
61752 if (VConstraint)
61753 return std::make_pair(0U, &X86::VR256XRegClass);
61754 return std::make_pair(0U, &X86::VR256RegClass);
61755 case MVT::v16bf16:
61756 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61757 break;
61758 if (VConstraint)
61759 return std::make_pair(0U, &X86::VR256XRegClass);
61760 return std::make_pair(0U, &X86::VR256RegClass);
61761 case MVT::v32i8:
61762 case MVT::v16i16:
61763 case MVT::v8i32:
61764 case MVT::v4i64:
61765 case MVT::v8f32:
61766 case MVT::v4f64:
61767 if (VConstraint && Subtarget.hasVLX())
61768 return std::make_pair(0U, &X86::VR256XRegClass);
61769 if (Subtarget.hasAVX())
61770 return std::make_pair(0U, &X86::VR256RegClass);
61771 break;
61772 case MVT::v32f16:
61773 if (!Subtarget.hasFP16())
61774 break;
61775 if (VConstraint)
61776 return std::make_pair(0U, &X86::VR512RegClass);
61777 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61778 case MVT::v32bf16:
61779 if (!Subtarget.hasBF16())
61780 break;
61781 if (VConstraint)
61782 return std::make_pair(0U, &X86::VR512RegClass);
61783 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61784 case MVT::v64i8:
61785 case MVT::v32i16:
61786 case MVT::v8f64:
61787 case MVT::v16f32:
61788 case MVT::v16i32:
61789 case MVT::v8i64:
61790 if (!Subtarget.hasAVX512()) break;
61791 if (VConstraint)
61792 return std::make_pair(0U, &X86::VR512RegClass);
61793 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61794 }
61795 break;
61796 }
61797 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61798 switch (Constraint[1]) {
61799 default:
61800 break;
61801 case 'i':
61802 case 't':
61803 case '2':
61804 return getRegForInlineAsmConstraint(TRI, "x", VT);
61805 case 'm':
61806 if (!Subtarget.hasMMX()) break;
61807 return std::make_pair(0U, &X86::VR64RegClass);
61808 case 'z':
61809 if (!Subtarget.hasSSE1()) break;
61810 switch (VT.SimpleTy) {
61811 default: break;
61812 // Scalar SSE types.
61813 case MVT::f16:
61814 if (!Subtarget.hasFP16())
61815 break;
61816 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61817 case MVT::f32:
61818 case MVT::i32:
61819 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61820 case MVT::f64:
61821 case MVT::i64:
61822 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61823 case MVT::v8f16:
61824 if (!Subtarget.hasFP16())
61825 break;
61826 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61827 case MVT::v8bf16:
61828 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61829 break;
61830 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61831 case MVT::f128:
61832 case MVT::v16i8:
61833 case MVT::v8i16:
61834 case MVT::v4i32:
61835 case MVT::v2i64:
61836 case MVT::v4f32:
61837 case MVT::v2f64:
61838 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61839 // AVX types.
61840 case MVT::v16f16:
61841 if (!Subtarget.hasFP16())
61842 break;
61843 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61844 case MVT::v16bf16:
61845 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61846 break;
61847 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61848 case MVT::v32i8:
61849 case MVT::v16i16:
61850 case MVT::v8i32:
61851 case MVT::v4i64:
61852 case MVT::v8f32:
61853 case MVT::v4f64:
61854 if (Subtarget.hasAVX())
61855 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61856 break;
61857 case MVT::v32f16:
61858 if (!Subtarget.hasFP16())
61859 break;
61860 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61861 case MVT::v32bf16:
61862 if (!Subtarget.hasBF16())
61863 break;
61864 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61865 case MVT::v64i8:
61866 case MVT::v32i16:
61867 case MVT::v8f64:
61868 case MVT::v16f32:
61869 case MVT::v16i32:
61870 case MVT::v8i64:
61871 if (Subtarget.hasAVX512())
61872 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61873 break;
61874 }
61875 break;
61876 case 'k':
61877 // This register class doesn't allocate k0 for masked vector operation.
61878 if (Subtarget.hasAVX512()) {
61879 if (VT == MVT::v1i1 || VT == MVT::i1)
61880 return std::make_pair(0U, &X86::VK1WMRegClass);
61881 if (VT == MVT::v8i1 || VT == MVT::i8)
61882 return std::make_pair(0U, &X86::VK8WMRegClass);
61883 if (VT == MVT::v16i1 || VT == MVT::i16)
61884 return std::make_pair(0U, &X86::VK16WMRegClass);
61885 }
61886 if (Subtarget.hasBWI()) {
61887 if (VT == MVT::v32i1 || VT == MVT::i32)
61888 return std::make_pair(0U, &X86::VK32WMRegClass);
61889 if (VT == MVT::v64i1 || VT == MVT::i64)
61890 return std::make_pair(0U, &X86::VK64WMRegClass);
61891 }
61892 break;
61893 }
61894 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61895 switch (Constraint[1]) {
61896 default:
61897 break;
61898 case 'r':
61899 if (VT == MVT::i8 || VT == MVT::i1)
61900 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
61901 if (VT == MVT::i16)
61902 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
61903 if (VT == MVT::i32 || VT == MVT::f32)
61904 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
61905 if (VT != MVT::f80 && !VT.isVector())
61906 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
61907 break;
61908 case 'R':
61909 if (VT == MVT::i8 || VT == MVT::i1)
61910 return std::make_pair(0U, &X86::GR8RegClass);
61911 if (VT == MVT::i16)
61912 return std::make_pair(0U, &X86::GR16RegClass);
61913 if (VT == MVT::i32 || VT == MVT::f32)
61914 return std::make_pair(0U, &X86::GR32RegClass);
61915 if (VT != MVT::f80 && !VT.isVector())
61916 return std::make_pair(0U, &X86::GR64RegClass);
61917 break;
61918 }
61919 }
61920
61921 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61922 return std::make_pair(0U, &X86::GR32RegClass);
61923
61924 // Use the default implementation in TargetLowering to convert the register
61925 // constraint into a member of a register class.
61926 std::pair<Register, const TargetRegisterClass*> Res;
61927 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
61928
61929 // Not found as a standard register?
61930 if (!Res.second) {
61931 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
61932 // to/from f80.
61933 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
61934 // Map st(0) -> st(7) -> ST0
61935 if (Constraint.size() == 7 && Constraint[0] == '{' &&
61936 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
61937 Constraint[3] == '(' &&
61938 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
61939 Constraint[5] == ')' && Constraint[6] == '}') {
61940 // st(7) is not allocatable and thus not a member of RFP80. Return
61941 // singleton class in cases where we have a reference to it.
61942 if (Constraint[4] == '7')
61943 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
61944 return std::make_pair(X86::FP0 + Constraint[4] - '0',
61945 &X86::RFP80RegClass);
61946 }
61947
61948 // GCC allows "st(0)" to be called just plain "st".
61949 if (StringRef("{st}").equals_insensitive(Constraint))
61950 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
61951 }
61952
61953 // flags -> EFLAGS
61954 if (StringRef("{flags}").equals_insensitive(Constraint))
61955 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
61956
61957 // dirflag -> DF
61958 // Only allow for clobber.
61959 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
61960 VT == MVT::Other)
61961 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
61962
61963 // fpsr -> FPSW
61964 // Only allow for clobber.
61965 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
61966 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
61967
61968 return Res;
61969 }
61970
61971 // Make sure it isn't a register that requires 64-bit mode.
61972 if (!Subtarget.is64Bit() &&
61973 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
61974 TRI->getEncodingValue(Res.first) >= 8) {
61975 // Register requires REX prefix, but we're in 32-bit mode.
61976 return std::make_pair(0, nullptr);
61977 }
61978
61979 // Make sure it isn't a register that requires AVX512.
61980 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
61981 TRI->getEncodingValue(Res.first) & 0x10) {
61982 // Register requires EVEX prefix.
61983 return std::make_pair(0, nullptr);
61984 }
61985
61986 // Otherwise, check to see if this is a register class of the wrong value
61987 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
61988 // turn into {ax},{dx}.
61989 // MVT::Other is used to specify clobber names.
61990 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
61991 return Res; // Correct type already, nothing to do.
61992
61993 // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
61994 // return "eax". This should even work for things like getting 64-bit integer
61995 // registers when given an f64 type.
61996 const TargetRegisterClass *Class = Res.second;
61997 // The generic code will match the first register class that contains the
61998 // given register. Thus, based on the ordering of the tablegened file,
61999 // the "plain" GR classes might not come first.
62000 // Therefore, use a helper method.
62001 if (isGRClass(*Class)) {
62002 unsigned Size = VT.getSizeInBits();
62003 if (Size == 1) Size = 8;
62004 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62005 return std::make_pair(0, nullptr);
62006 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62007 if (DestReg.isValid()) {
62008 bool is64Bit = Subtarget.is64Bit();
62009 const TargetRegisterClass *RC =
62010 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62011 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62012 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62013 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62014 if (Size == 64 && !is64Bit) {
62015 // Model GCC's behavior here and select a fixed pair of 32-bit
62016 // registers.
62017 switch (DestReg) {
62018 case X86::RAX:
62019 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62020 case X86::RDX:
62021 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62022 case X86::RCX:
62023 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62024 case X86::RBX:
62025 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62026 case X86::RSI:
62027 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62028 case X86::RDI:
62029 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62030 case X86::RBP:
62031 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62032 default:
62033 return std::make_pair(0, nullptr);
62034 }
62035 }
62036 if (RC && RC->contains(DestReg))
62037 return std::make_pair(DestReg, RC);
62038 return Res;
62039 }
62040 // No register found/type mismatch.
62041 return std::make_pair(0, nullptr);
62042 } else if (isFRClass(*Class)) {
62043 // Handle references to XMM physical registers that got mapped into the
62044 // wrong class. This can happen with constraints like {xmm0} where the
62045 // target independent register mapper will just pick the first match it can
62046 // find, ignoring the required type.
62047
62048 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62049 if (VT == MVT::f16)
62050 Res.second = &X86::FR16XRegClass;
62051 else if (VT == MVT::f32 || VT == MVT::i32)
62052 Res.second = &X86::FR32XRegClass;
62053 else if (VT == MVT::f64 || VT == MVT::i64)
62054 Res.second = &X86::FR64XRegClass;
62055 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62056 Res.second = &X86::VR128XRegClass;
62057 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62058 Res.second = &X86::VR256XRegClass;
62059 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62060 Res.second = &X86::VR512RegClass;
62061 else {
62062 // Type mismatch and not a clobber: Return an error;
62063 Res.first = 0;
62064 Res.second = nullptr;
62065 }
62066 } else if (isVKClass(*Class)) {
62067 if (VT == MVT::v1i1 || VT == MVT::i1)
62068 Res.second = &X86::VK1RegClass;
62069 else if (VT == MVT::v8i1 || VT == MVT::i8)
62070 Res.second = &X86::VK8RegClass;
62071 else if (VT == MVT::v16i1 || VT == MVT::i16)
62072 Res.second = &X86::VK16RegClass;
62073 else if (VT == MVT::v32i1 || VT == MVT::i32)
62074 Res.second = &X86::VK32RegClass;
62075 else if (VT == MVT::v64i1 || VT == MVT::i64)
62076 Res.second = &X86::VK64RegClass;
62077 else {
62078 // Type mismatch and not a clobber: Return an error;
62079 Res.first = 0;
62080 Res.second = nullptr;
62081 }
62082 }
62083
62084 return Res;
62085}
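// Illustrative sketch, not part of this file: constraints naming specific
// registers (e.g. "a"/"d", which front ends typically rewrite to "{ax}"/"{dx}")
// are ultimately resolved through the logic above. A classic x86 usage:
static unsigned long long readTscExample() {
  unsigned Lo, Hi;
  asm volatile("rdtsc" : "=a"(Lo), "=d"(Hi)); // results pinned to EAX and EDX
  return ((unsigned long long)Hi << 32) | Lo;
}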
62086
62087bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62088 // Integer division on x86 is expensive. However, when aggressively optimizing
62089 // for code size, we prefer to use a div instruction, as it is usually smaller
62090 // than the alternative sequence.
62091 // The exception to this is vector division. Since x86 doesn't have vector
62092 // integer division, leaving the division as-is is a loss even in terms of
62093 // size, because it will have to be scalarized, while the alternative code
62094 // sequence can be performed in vector form.
62095 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62096 return OptSize && !VT.isVector();
62097}
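// Illustrative sketch, not part of this file: with Clang's minsize attribute
// (or -Oz) the hook above returns true for scalar types, so the division below
// stays a small div instruction instead of a multiply/shift expansion.
__attribute__((minsize)) static unsigned divByNineExample(unsigned X) {
  return X / 9;
}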
62098
62099void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62100 if (!Subtarget.is64Bit())
62101 return;
62102
62103 // Update IsSplitCSR in X86MachineFunctionInfo.
62104 X86MachineFunctionInfo *AFI =
62105 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62106 AFI->setIsSplitCSR(true);
62107}
62108
62109void X86TargetLowering::insertCopiesSplitCSR(
62110 MachineBasicBlock *Entry,
62111 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62112 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62113 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62114 if (!IStart)
62115 return;
62116
62117 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62118 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62119 MachineBasicBlock::iterator MBBI = Entry->begin();
62120 for (const MCPhysReg *I = IStart; *I; ++I) {
62121 const TargetRegisterClass *RC = nullptr;
62122 if (X86::GR64RegClass.contains(*I))
62123 RC = &X86::GR64RegClass;
62124 else
62125 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62126
62127 Register NewVR = MRI->createVirtualRegister(RC);
62128 // Create copy from CSR to a virtual register.
62129 // FIXME: this currently does not emit CFI pseudo-instructions, it works
62130 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62131 // nounwind. If we want to generalize this later, we may need to emit
62132 // CFI pseudo-instructions.
62133 assert(
62134 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62135 "Function should be nounwind in insertCopiesSplitCSR!");
62136 Entry->addLiveIn(*I);
62137 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62138 .addReg(*I);
62139
62140 // Insert the copy-back instructions right before the terminator.
62141 for (auto *Exit : Exits)
62142 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62143 TII->get(TargetOpcode::COPY), *I)
62144 .addReg(NewVR);
62145 }
62146}
62147
62148bool X86TargetLowering::supportSwiftError() const {
62149 return Subtarget.is64Bit();
62150}
62151
62152MachineInstr *
62153X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
62154 MachineBasicBlock::iterator &MBBI,
62155 const TargetInstrInfo *TII) const {
62156 assert(MBBI->isCall() && MBBI->getCFIType() &&
62157 "Invalid call instruction for a KCFI check");
62158
62159 MachineFunction &MF = *MBB.getParent();
62160 // If the call target is a memory operand, unfold it and use R11 for the
62161 // call, so KCFI_CHECK won't have to recompute the address.
62162 switch (MBBI->getOpcode()) {
62163 case X86::CALL64m:
62164 case X86::CALL64m_NT:
62165 case X86::TAILJMPm64:
62166 case X86::TAILJMPm64_REX: {
62167 MachineBasicBlock::instr_iterator OrigCall = MBBI->getIterator();
62168 SmallVector<MachineInstr *, 2> NewMIs;
62169 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62170 /*UnfoldStore=*/false, NewMIs))
62171 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62172 for (auto *NewMI : NewMIs)
62173 MBBI = MBB.insert(OrigCall, NewMI);
62174 assert(MBBI->isCall() &&
62175 "Unexpected instruction after memory operand unfolding");
62176 if (OrigCall->shouldUpdateAdditionalCallInfo())
62177 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62178 MBBI->setCFIType(MF, OrigCall->getCFIType());
62179 OrigCall->eraseFromParent();
62180 break;
62181 }
62182 default:
62183 break;
62184 }
62185
62186 MachineOperand &Target = MBBI->getOperand(0);
62187 Register TargetReg;
62188 switch (MBBI->getOpcode()) {
62189 case X86::CALL64r:
62190 case X86::CALL64r_ImpCall:
62191 case X86::CALL64r_NT:
62192 case X86::TAILJMPr64:
62193 case X86::TAILJMPr64_REX:
62194 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62195 Target.setIsRenamable(false);
62196 TargetReg = Target.getReg();
62197 break;
62198 case X86::CALL64pcrel32:
62199 case X86::TAILJMPd64:
62200 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62201 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62202 // 64-bit indirect thunk calls.
62203 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62204 "Unexpected register for an indirect thunk call");
62205 TargetReg = X86::R11;
62206 break;
62207 default:
62208 llvm_unreachable("Unexpected CFI call opcode");
62209 break;
62210 }
62211
62212 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62213 .addReg(TargetReg)
62214 .addImm(MBBI->getCFIType())
62215 .getInstr();
62216}
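// Illustrative sketch, not part of this file: when building with
// -fsanitize=kcfi, an indirect call like the one below is the kind of site
// that receives the KCFI_CHECK emitted above against the target register.
using HandlerFnExample = int (*)(int);
static int dispatchExample(HandlerFnExample H, int V) {
  return H(V); // indirect call checked against the callee's KCFI type id
}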
62217
62218/// Returns true if stack probing through a function call is requested.
62219bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
62220 return !getStackProbeSymbolName(MF).empty();
62221}
62222
62223/// Returns true if stack probing through inline assembly is requested.
62224bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
62225
62226 // No inline stack probe for Windows, they have their own mechanism.
62227 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62228 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62229 return false;
62230
62231 // If the function specifically requests inline stack probes, emit them.
62232 if (MF.getFunction().hasFnAttribute("probe-stack"))
62233 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62234 "inline-asm";
62235
62236 return false;
62237}
62238
62239/// Returns the name of the symbol used to emit stack probes or the empty
62240/// string if not applicable.
62241StringRef X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
62242 // Inline Stack probes disable stack probe call
62244 if (hasInlineStackProbe(MF))
62245 return "";
62246
62247 // If the function specifically requests stack probes, emit them.
62248 if (MF.getFunction().hasFnAttribute("probe-stack"))
62249 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62250
62251 // Generally, if we aren't on Windows, the platform ABI does not include
62252 // support for stack probes, so don't emit them.
62253 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62254 Subtarget.isTargetMachO() ||
62255 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62256 return "";
62257
62258 // We need a stack probe to conform to the Windows ABI. Choose the right
62259 // symbol.
62260 if (Subtarget.is64Bit())
62261 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62262 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62263}
62264
62265unsigned
62266X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
62267 // The default stack probe size is 4096 if the function has no stackprobesize
62268 // attribute.
62269 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62270 4096);
62271}
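// Illustrative sketch, not part of this file: how the attributes read by the
// stack-probe hooks above might be set on an llvm::Function programmatically.
#include "llvm/IR/Function.h"
static void requestInlineProbesExample(llvm::Function &F) {
  F.addFnAttr("probe-stack", "inline-asm"); // selects inline stack probes
  F.addFnAttr("stack-probe-size", "8192");  // overrides the 4096 default above
}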
62272
62273Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
62274 if (ML && ML->isInnermost() &&
62275 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62276 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
62277 return TargetLowering::getPrefLoopAlignment();
62278}
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert an i1 subvector into an i1 vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
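For reference, PMADDWD multiplies adjacent pairs of signed 16-bit elements and sums each pair into a 32-bit lane; matchPMADDWD looks for that multiply-plus-pairwise-add shape in the DAG. A minimal intrinsic illustration of the operation itself (SSE2, not the matching code):

  #include <immintrin.h>
  // Each 32-bit result lane is a[2i]*b[2i] + a[2i+1]*b[2i+1].
  __m128i dotPairsI16(__m128i a, __m128i b) {
    return _mm_madd_epi16(a, b); // PMADDWD
  }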
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
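The "inreg sign extension" shifts each element left and then arithmetic-shifts it back so the upper bits become sign copies of the bits being kept; PACKSS then narrows with signed saturation, which is lossless for such values. A hedged SSE2 sketch of the same idea for a v4i32 pair into v8i16 (illustrative, not the actual helper):

  #include <immintrin.h>
  // Keep the low 16 bits of each i32 lane of Lo/Hi and pack into one v8i16.
  __m128i truncV4I32ToV8I16(__m128i Lo, __m128i Hi) {
    Lo = _mm_srai_epi32(_mm_slli_epi32(Lo, 16), 16); // in-reg sign extend
    Hi = _mm_srai_epi32(_mm_slli_epi32(Hi, 16), 16);
    return _mm_packs_epi32(Lo, Hi);                  // PACKSSDW
  }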
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
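The trick: unsigned saturating subtraction is zero exactly when the first operand is less than or equal to the second, so PSUBUS plus a compare-with-zero expresses unsigned comparisons without the signed-only PCMPGT. A small SSE2 illustration of that identity:

  #include <immintrin.h>
  // Per-byte unsigned a <= b: saturating a - b is zero iff a <= b.
  __m128i uleU8(__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(_mm_subs_epu8(a, b), _mm_setzero_si128());
  }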
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::PACK...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips the sign of an FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
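The BUILD_VECTOR pattern being recognized is "every result element is the sum (or difference) of two adjacent elements of the sources", which is what HADDPS/HADDPD/PHADD compute directly. A tiny SSE3 illustration of the target operation (assumes the build_vector shape has already been matched):

  #include <immintrin.h>
  // r = { a0+a1, a2+a3, b0+b1, b2+b3 } -- the horizontal-add shape.
  __m128 horizontalAdd(__m128 a, __m128 b) {
    return _mm_hadd_ps(a, b); // HADDPS
  }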
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary 64-bit vperm shuffle followed by a 256-bit unpack.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
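The fold relies on the scalar identity x ^ -x == ~(x ^ (x - 1)), i.e. XOR-with-negation is the complement of BLSMSK, so the surrounding AND becomes one ANDN of BLSMSK(X). A standalone check of the identity (illustrative; the real combine rewrites DAG nodes):

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>
  int main() {
    const uint32_t y = 0x12345678u;
    for (uint32_t x : {1u, 6u, 0x80000000u, 0xdeadbeefu}) {
      uint32_t blsmsk = x ^ (x - 1); // BLSMSK: bits up to and including the lowest set bit
      assert((y & (x ^ (0u - x))) == (~blsmsk & y)); // AND(Y,XOR(X,NEG(X))) == ANDN(BLSMSK(X),Y)
    }
  }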
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
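When the mask only interleaves source elements with known-zero elements, the shuffle is really a zero extension: plain SSE2 can do it by unpacking against a zero vector, and SSE4.1+ can use PMOVZX. An illustrative SSE2 sketch of that shape:

  #include <immintrin.h>
  // Zero-extend the low 8 bytes of v to 16-bit lanes by interleaving with zeros.
  __m128i zextLoBytesToWords(__m128i v) {
    return _mm_unpacklo_epi8(v, _mm_setzero_si128());
  }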
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
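The core trick is PSADBW against an all-zero vector: the sum of absolute differences |b - 0| is just b, so each 64-bit half of the result holds the sum of its eight bytes. A minimal illustration (not the helper itself):

  #include <immintrin.h>
  // Sum all 16 bytes of v: PSADBW vs. zero yields two partial sums, then add them.
  unsigned sumBytes(__m128i v) {
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128()); // byte sum per 64-bit half
    return (unsigned)(_mm_cvtsi128_si32(sad) +
                      _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
  }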
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
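ANDNP/PANDN compute "(~first operand) & second operand" in a single instruction, which is why a NOT feeding an AND collapses into it. The intrinsic view of the fold (illustrative):

  #include <immintrin.h>
  // (~x) & y in one PANDN.
  __m128i andNot(__m128i x, __m128i y) {
    return _mm_andnot_si128(x, y);
  }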
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
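Splitting is the usual fallback when the full-width store is illegal or undesirable for the target; each half keeps its own pointer offset and alignment. A hedged AVX2 illustration of the same shape (the real helper rewrites StoreSDNodes, not intrinsics):

  #include <immintrin.h>
  #include <cstdint>
  // Store a 256-bit value as two unaligned 128-bit stores.
  void storeSplit(uint8_t *p, __m256i v) {
    _mm_storeu_si128((__m128i *)p,        _mm256_castsi256_si128(v));      // low half
    _mm_storeu_si128((__m128i *)(p + 16), _mm256_extracti128_si256(v, 1)); // high half
  }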
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
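SHUFPS/PSHUFD encode a 4-element mask as four 2-bit fields in one immediate byte, with element 0 in the low bits (the same byte layout _MM_SHUFFLE builds, though that macro takes its arguments high-to-low). A small sketch of the encoding:

  #include <cassert>
  // Pack a 4-lane mask {m0,m1,m2,m3}, each 0..3, into the PSHUFD/SHUFPS imm8.
  unsigned shuffleImm8(int m0, int m1, int m2, int m3) {
    return (unsigned)(m0 | (m1 << 2) | (m2 << 4) | (m3 << 6));
  }
  int main() { assert(shuffleImm8(0, 1, 2, 3) == 0xE4); } // identity mask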
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit into a mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if the opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext) zext(add_nuw(x, C)) --> add(zext(x), C_zext) Promoting a...
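The rewrite is valid because nsw/nuw guarantee the narrow add does not wrap, so extending before or after the add yields the same value. A scalar sketch of the sext case under that no-overflow assumption (illustrative only):

  #include <cassert>
  #include <cstdint>
  int main() {
    int32_t x = 1000;                    // assume x + C does not overflow i32 (the nsw premise)
    const int32_t C = 42;
    int64_t a = (int64_t)(x + C);        // sext(add_nsw(x, C))
    int64_t b = (int64_t)x + (int64_t)C; // add(sext(x), C_sext)
    assert(a == b);
  }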
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
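A scalar reference for the semantics this lowering has to preserve, assuming plain doubles; roundTiesAway is an illustrative helper, and std::round already follows the same rule:

  #include <cassert>
  #include <cmath>

  // Round to nearest, ties away from zero.
  static double roundTiesAway(double X) {
    double T = std::trunc(X);
    if (std::fabs(X - T) >= 0.5)          // ties go away from zero
      return T + std::copysign(1.0, X);
    return T;
  }

  int main() {
    assert(roundTiesAway(2.5) == 3.0 && std::round(2.5) == 3.0);
    assert(roundTiesAway(-2.5) == -3.0 && std::round(-2.5) == -3.0);
    assert(roundTiesAway(2.4) == 2.0);
    return 0;
  }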
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
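In scalar form, the saturating-truncate pattern being detected is a truncate fed by an unsigned min against the destination type's maximum; truncUSat is an illustrative name and u32 to u8 is just an example width:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static uint8_t truncUSat(uint32_t X) {
    return (uint8_t)std::min<uint32_t>(X, 255);  // trunc(umin(x, 255))
  }

  int main() {
    assert(truncUSat(300) == 255);  // saturates instead of wrapping to 44
    assert(truncUSat(42) == 42);    // in-range values pass through
    return 0;
  }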
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
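A scalar sketch of that observation, assuming IEEE-754 doubles: FABS is an AND with the inverted sign-bit mask, FNEG is an XOR with the sign-bit mask. bitOp is an illustrative helper, not the lowering code:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static double bitOp(double X, uint64_t Mask, bool IsAnd) {
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));
    Bits = IsAnd ? (Bits & Mask) : (Bits ^ Mask);
    std::memcpy(&X, &Bits, sizeof(X));
    return X;
  }

  int main() {
    const uint64_t SignBit = 1ULL << 63;
    assert(bitOp(-3.5, ~SignBit, /*IsAnd=*/true) == 3.5);   // FABS: AND with ~signbit
    assert(bitOp(3.5, SignBit, /*IsAnd=*/false) == -3.5);   // FNEG: XOR with signbit
    return 0;
  }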
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
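The boolean identity behind this fold, checked over a few scalar values; inverting the condition absorbs the xor-with-1:

  #include <cassert>

  int main() {
    for (int A = -2; A <= 2; ++A)
      for (int B = -2; B <= 2; ++B)
        assert(((A < B) ^ 1) == (A >= B));   // xor(setcc(lt), 1) == setcc(ge)
    return 0;
  }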
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
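The scalar idea the recognizer is after: an equality memcmp over fixed-size data can be emitted as XORs of the pieces, OR-reduced, and compared against zero once. equal32Bytes is a hypothetical helper shown on four 64-bit words:

  #include <cassert>
  #include <cstdint>

  static bool equal32Bytes(const uint64_t A[4], const uint64_t B[4]) {
    uint64_t Acc = (A[0] ^ B[0]) | (A[1] ^ B[1]) |
                   (A[2] ^ B[2]) | (A[3] ^ B[3]);
    return Acc == 0;                  // single compare of the OR-reduced result
  }

  int main() {
    uint64_t X[4] = {1, 2, 3, 4}, Y[4] = {1, 2, 3, 4}, Z[4] = {1, 2, 3, 5};
    assert(equal32Bytes(X, Y) && !equal32Bytes(X, Z));
    return 0;
  }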
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
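A per-lane scalar model of why the AND mask is needed, assuming i16 to i8 lanes: PACKUS saturates, so clearing the high bits first makes it act as a plain truncation. packus here is an illustrative model of one lane, not an intrinsic:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static uint8_t packus(int16_t X) {   // one PACKUS lane: signed i16 -> u8 with unsigned saturation
    return (uint8_t)std::clamp<int>(X, 0, 255);
  }

  int main() {
    int16_t V = 0x1234;
    assert(packus(V) == 255);            // without the mask, the lane saturates
    assert(packus(V & 0x00FF) == 0x34);  // AND with 0xFF first -> true truncate
    return 0;
  }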
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
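The identity itself is plain De Morgan and can be checked exhaustively on narrow integers; this is only a verification sketch, not the DAG combine:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned X = 0; X < 256; ++X)
      for (unsigned Y = 0; Y < 256; ++Y)
        for (unsigned Z = 0; Z < 256; ++Z) {
          uint8_t L = X & (Y | (uint8_t)~Z);              // and X, (or Y, ~Z)
          uint8_t R = X & (uint8_t)~((uint8_t)~Y & Z);    // and X, ~(and ~Y, Z)
          assert(L == R);
        }
    return 0;
  }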
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
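In scalar terms, the shifted-and-xored sign bit is just a non-negativity test, which one signed compare against zero answers directly; a small check assuming i32 values:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int32_t X : {-5, -1, 0, 7}) {
      uint32_t SignBit = (uint32_t)X >> 31;   // TRUNCATE(SRL(X, size(X)-1))
      bool Folded = (X >= 0);                 // the equivalent setcc
      assert(((SignBit ^ 1u) != 0) == Folded);
    }
    return 0;
  }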
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
The Input class is used to parse a yaml document into in-memory structs and vectors.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition APFloat.cpp:6082
void clearSign()
Definition APFloat.h:1298
opStatus next(bool nextDown)
Definition APFloat.h:1254
void changeSign()
Definition APFloat.h:1297
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
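A short usage sketch tying together a few of the APInt helpers listed above (getLowBitsSet, isMask, countr_zero, popcount); it assumes the llvm/ADT/APInt.h header is available:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    llvm::APInt Mask = llvm::APInt::getLowBitsSet(/*numBits=*/32, /*loBitsSet=*/8);
    assert(Mask.isMask(8));            // 0x000000FF is a low-bits mask
    assert(Mask.countr_zero() == 0);   // no trailing zero bits
    assert(Mask.popcount() == 8);      // eight bits set
    return 0;
  }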
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:162
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:170
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:188
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ ICMP_NE
not equal
Definition InstrTypes.h:700
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:767
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:229
unsigned size() const
Definition DenseMap.h:108
bool empty() const
Definition DenseMap.h:107
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:214
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:424
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
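As a hedged illustration of how the masked load/store accessors above are typically consulted before attempting a combine (the helper name is hypothetical and not part of this file):
  static bool isPlainMaskedStoreSketch(SDNode *N) {
    // Accept only the simple form: no truncation, no compression, and an
    // unindexed addressing mode.
    auto *MSt = cast<MaskedStoreSDNode>(N);
    if (MSt->isTruncatingStore() || MSt->isCompressingStore())
      return false;
    return MSt->getAddressingMode() == ISD::UNINDEXED;
  }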
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUses uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
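A minimal sketch combining several of the SelectionDAG builders listed here (getConstant, getSetCC, getNegative, getSelect) to expand an integer absolute value; the helper name and its parameter list are assumptions for the example, not code from this file:
  static SDValue lowerAbsSketch(SDValue Op, EVT VT, const SDLoc &DL,
                                SelectionDAG &DAG, const TargetLowering &TLI) {
    // abs(x) == select(x < 0, 0 - x, x)
    SDValue Zero = DAG.getConstant(0, DL, VT);
    EVT CCVT =
        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue IsNeg = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETLT);
    return DAG.getSelect(DL, VT, IsNeg, DAG.getNegative(Op, DL, VT), Op);
  }
The setcc result type is queried via getSetCCResultType rather than hard-coded, since targets differ in how they represent boolean values and boolean vectors.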
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
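A small sketch of the known-bits queries above, checking whether the upper half of a 64-bit value is provably zero; the helper name is hypothetical and the 64/32 split is just an example:
  static bool upperHalfKnownZeroSketch(SDValue Op, SelectionDAG &DAG) {
    // Two equivalent queries: a masked test, or the full KnownBits result.
    APInt HighMask = APInt::getHighBitsSet(64, 32);
    if (DAG.MaskedValueIsZero(Op, HighMask))
      return true;
    KnownBits Known = DAG.computeKnownBits(Op);
    return Known.countMinLeadingZeros() >= 32;
  }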
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
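A short sketch of the SmallSet insert() idiom documented above, deduplicating the register operands of a MachineInstr MI (assumed to be in scope):
  // insert() returns a pair whose second member is true only for values not
  // already present, so the vector ends up with each register once.
  SmallSet<Register, 8> Seen;
  SmallVector<Register, 8> UniqueRegs;
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.getReg().isValid() && Seen.insert(MO.getReg()).second)
      UniqueRegs.push_back(MO.getReg());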
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:151
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:154
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:281
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:180
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
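A minimal sketch of the StringSwitch pattern, mapping a register name to a Register value; the particular names and enum values are illustrative, not the actual lookup used by this file:
  static Register regFromNameSketch(StringRef Name) {
    // Each Case() compares against one literal; Default() supplies the
    // result when nothing matched.
    return StringSwitch<Register>(Name)
        .Case("esp", X86::ESP)
        .Case("ebp", X86::EBP)
        .Case("rsp", X86::RSP)
        .Default(Register());
  }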
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B. This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0, return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
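A hedged sketch of how a TargetLowering constructor typically populates the action tables declared above; the specific opcodes, types and register class are chosen for illustration (Subtarget is assumed to be a member), not a transcription of X86's actual configuration:
  // Inside a hypothetical target's TargetLowering constructor.
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);    // lower via generic code
  setOperationAction(ISD::SELECT, MVT::v4i32, Custom); // LowerOperation handles it
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::v4i32, Custom);
  setTargetDAGCombine(ISD::AND); // ask for PerformDAGCombine callbacks on AND
  computeRegisterProperties(Subtarget.getRegisterInfo());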
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:774
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
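A tiny sketch exercising the Type predicates listed above, as a target hook might when deciding whether to handle a scalar type directly; the policy and helper name are invented for the example:
  static bool isHandledScalarSketch(Type *Ty) {
    // Reject vectors, accept the IEEE fp scalars and integers up to 64 bits.
    if (Ty->isVectorTy())
      return false;
    if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
      return true;
    return Ty->isIntegerTy() && Ty->getScalarSizeInBits() <= 64;
  }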
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
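A brief sketch of the Value use-list accessors above; the helper is hypothetical and simply rewires a single-use value to undef of the same type:
  static bool replaceSingleUseWithUndefSketch(Value *V) {
    // hasOneUse() walks the use list; replaceAllUsesWith() rewires every use.
    if (!V->hasOneUse())
      return false;
    V->replaceAllUsesWith(UndefValue::get(V->getType()));
    return V->use_empty(); // trivially true after the rewrite
  }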
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0, return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B. This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
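The entries above are X86TargetLowering's overrides of generic TargetLowering hooks. As a minimal sketch (not code from this file; DAG, Y, Imm and Ty are assumed to come from the surrounding combine), target-independent DAG code typically consults such hooks like this:

  // Sketch only: query a few of the hooks listed above from generic code.
  static bool canUseCheapCompareSequence(const SelectionDAG &DAG, SDValue Y,
                                         int64_t Imm, Type *Ty) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    return TLI.isLegalICmpImmediate(Imm) && // Imm usable directly in a compare
           TLI.hasAndNot(Y) &&              // target has an ANDN-style operation
           TLI.isCheapToSpeculateCttz(Ty);  // speculating cttz on Ty is cheap
  }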
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:169
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:134
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
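As a small, hedged illustration (F is an arbitrary llvm::Function, not taken from this file), code that dispatches on these calling-convention identifiers usually looks like:

  switch (F.getCallingConv()) {
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
    break; // Win32-style conventions where the callee pops its own arguments.
  case CallingConv::Tail:
  case CallingConv::SwiftTail:
    break; // conventions that guarantee tail-call lowering
  default:
    break;
  }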
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:525
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:892
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:706
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:809
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:663
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:881
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:701
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:690
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:903
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:927
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:713
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is a bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is an EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
If N is a BUILD_VECTOR or SPLAT_VECTOR whose elements are all the same constant or undefined, returns true and returns the splat value in SplatValue.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
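A hedged sketch of how these ISD helpers are commonly combined in DAG code (N, VT and CC are assumed inputs, not values from this file):

  // N is an SDNode*, VT an EVT, CC an ISD::CondCode.
  APInt SplatVal;
  bool AllOnesSplat =
      ISD::isConstantSplatVector(N, SplatVal) && SplatVal.isAllOnes();
  bool ZeroVec = ISD::isBuildVectorAllZeros(N);

  ISD::CondCode InvCC  = ISD::getSetCCInverse(CC, VT);     // !(x CC y)
  ISD::CondCode SwapCC = ISD::getSetCCSwappedOperands(CC); // y CC' x
  bool IsSigned        = ISD::isSignedIntSetCC(CC);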
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
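These llvm::PatternMatch combinators operate on IR Values. A minimal sketch (V is assumed to be some Value*; the pattern is an arbitrary example):

  using namespace llvm::PatternMatch;

  // Does V look like (A & B) == 0 ?  A and B are captured on success.
  Value *A = nullptr, *B = nullptr;
  bool IsMaskEqZero =
      match(V, m_SpecificICmp(ICmpInst::ICMP_EQ,
                              m_c_And(m_Value(A), m_Value(B)), m_Zero()));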
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
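These RTLIB helpers map a (source type, result type) pair to a runtime-library call. A small sketch (the value types are arbitrary examples):

  // Which libcall converts f32 -> i64 when no native instruction is used?
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f32, MVT::i64);
  if (LC == RTLIB::UNKNOWN_LIBCALL)
    ; // no suitable libcall for this type pair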
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
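The SDPatternMatch matchers above mirror the IR-level PatternMatch API but work on SelectionDAG nodes. A hedged sketch (Op is assumed to be an SDValue seen during a combine):

  using namespace llvm::SDPatternMatch;

  // Does Op look like (setcc X, Y, eq) ?  X and Y are captured on success.
  SDValue X, Y;
  bool IsEqCompare =
      sd_match(Op, m_SetCC(m_Value(X), m_Value(Y),
                           m_SpecificCondCode(ISD::SETEQ)));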
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
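A brief, hedged sketch of the llvm::X86 helpers above (Elt is an assumed SDValue operand):

  if (X86::isZeroNode(Elt))
    ; // Elt is the integer constant 0 or the FP constant +0.0

  // Invert an X86 condition code, e.g. for swapping branch targets.
  X86::CondCode Opp = X86::GetOppositeBranchCondition(X86::COND_NE); // COND_E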
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:47
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1733
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
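all_of, together with any_of, count_if and the other range helpers further down this list, wraps the <algorithm> equivalent so a whole range can be passed. A small self-contained sketch:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  void rangeHelpersSketch() {
    llvm::SmallVector<int, 4> Mask = {0, -1, 2, -1};
    bool AllDefined = llvm::all_of(Mask, [](int M) { return M >= 0; });  // false
    bool AnyUndef   = llvm::any_of(Mask, [](int M) { return M < 0; });   // true
    auto NumUndef   = llvm::count_if(Mask, [](int M) { return M < 0; }); // 2
    (void)AllDefined; (void)AnyUndef; (void)NumUndef;
  }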
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1665
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1607
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either a integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:626
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
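The Decode* helpers in this list turn X86 shuffle immediates into the generic shuffle-mask form, where indices 0..NumElts-1 select from the first operand and NumElts..2*NumElts-1 from the second. A small sketch for DecodeBLENDMask (the immediate is an arbitrary example):

  llvm::SmallVector<int, 8> ShuffleMask;
  // 4 elements, immediate 0b0101: bits 0 and 2 select from the second source.
  llvm::DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, ShuffleMask);
  // ShuffleMask is now {4, 1, 6, 3}.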
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immedi...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:293
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2058
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1589
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:314
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1759
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:348
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:186
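A self-contained sketch of these integer utilities (isPowerOf2_64 and Log2_64 appear just above, isInt earlier in this list):

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"

  void intUtilitySketch() {
    uint64_t V = 48;                      // 0b110000
    bool Pow2   = llvm::isPowerOf2_64(V); // false
    unsigned L2 = llvm::Log2_64(V);       // 5 (floor of log2)
    int TZ      = llvm::countr_zero(V);   // 4 trailing zero bits
    bool Fits   = llvm::isInt<8>(-100);   // true: -100 fits in a signed 8-bit value
    (void)Pow2; (void)L2; (void)TZ; (void)Fits;
  }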
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:400
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1721
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1976
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
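A short sketch of narrowShuffleMaskElts: each wide-element index expands to Scale consecutive narrow-element indices (the mask values are arbitrary examples):

  llvm::SmallVector<int, 8> Narrow;
  int WideMask[] = {0, 3}; // a 2-element shuffle mask
  llvm::narrowShuffleMaskElts(/*Scale=*/2, WideMask, Narrow);
  // Narrow is now {0, 1, 6, 7}.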
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1842
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1936
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
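A hedged sketch of how these SDValue constant predicates are typically used in combines (V is an assumed operand, not a value from this file):

  if (llvm::isNullConstant(V))
    ; // V is the scalar integer constant 0

  if (llvm::ConstantSDNode *C =
          llvm::isConstOrConstSplat(V, /*AllowUndefs=*/true)) {
    const llvm::APInt &Splat = C->getAPIntValue(); // same constant in every defined lane
    (void)Splat;
  }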
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1817
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1943
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction -- th...
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
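commonAlignment clamps an alignment to what still holds after adding a byte offset. A tiny example (the values are arbitrary):

  llvm::Align Base(16);
  llvm::Align AtOffset = llvm::commonAlignment(Base, /*Offset=*/24);
  // AtOffset is Align(8): a 16-byte-aligned address plus 24 bytes is only 8-byte aligned.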
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2090
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1592
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:299
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement. An example is: DWORD PTR [EAX + 4].
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
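A quick sketch of the bit utilities nearby (llvm::popcount above and llvm::bit_floor a few entries earlier), with example values chosen purely for illustration:
  #include <cassert>
  #include "llvm/ADT/bit.h"

  void demoBitHelpers() {
    assert(llvm::popcount(0xF0u) == 4 && "four bits set in 0xF0");
    assert(llvm::bit_floor(100u) == 64u && "largest power of two <= 100");
  }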
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
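A brief sketch combining the two mask utilities above (getSplatIndex and scaleShuffleMaskElts); the concrete masks are invented for illustration:
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h" // assumed home of both helpers

  void demoMaskHelpers() {
    // Every lane of a splat mask reads source element 1.
    int Splat[] = {1, 1, 1, 1};
    int Idx = llvm::getSplatIndex(Splat); // expected: 1

    // Widen an identity v4 mask to the equivalent v8 mask: each source
    // element expands to two consecutive destination elements.
    llvm::SmallVector<int, 8> Wide;
    bool OK = llvm::scaleShuffleMaskElts(/*NumDstElts=*/8, {0, 1, 2, 3}, Wide);
    // On success Wide is expected to be {0, 1, 2, 3, 4, 5, 6, 7}.
    (void)Idx; (void)OK;
  }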
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
#define EQ(a, b)
Definition regexec.c:65
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static constexpr roundingMode rmTowardZero
Definition APFloat.h:308
static LLVM_ABI const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition APFloat.cpp:289
static LLVM_ABI const fltSemantics & IEEEquad() LLVM_READNONE
Definition APFloat.cpp:268
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:324
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
static LLVM_ABI const fltSemantics & BFloat() LLVM_READNONE
Definition APFloat.cpp:265
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:320
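Tying together the fltSemantics accessors, the rounding-mode constants, and opStatus listed above, a minimal conversion sketch (the literal 0.1 is arbitrary):
  #include "llvm/ADT/APFloat.h"

  void demoConvert() {
    llvm::APFloat F(llvm::APFloat::IEEEdouble(), "0.1");
    bool LosesInfo = false;
    // Round the double-precision value to single precision using the default
    // nearest-even rounding; the returned opStatus reports inexactness.
    llvm::APFloat::opStatus St = F.convert(llvm::APFloat::IEEEsingle(),
                                           llvm::APFloat::rmNearestTiesToEven,
                                           &LosesInfo);
    (void)St;
  }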
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
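For the Align struct above and the commonAlignment helper listed earlier, a small sketch with illustrative numbers:
  #include <cassert>
  #include "llvm/Support/Alignment.h"

  void demoAlign() {
    llvm::Align A(16);
    assert(A.value() == 16);
    // An offset of 8 bytes from a 16-byte aligned base is only guaranteed
    // to be 8-byte aligned.
    assert(llvm::commonAlignment(A, /*Offset=*/8).value() == 8);
  }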
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
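A compact sketch exercising a few of the EVT queries above; the LLVMContext is assumed to come from whatever lowering code surrounds it:
  #include "llvm/CodeGen/ValueTypes.h"

  void demoEVT(llvm::LLVMContext &Ctx) {
    // v4f32: a 128-bit vector of four 32-bit floats.
    llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
    bool Is128 = VT.is128BitVector();             // true
    unsigned NumElts = VT.getVectorNumElements(); // 4
    // v4i32: same element count, integer elements of the same width.
    llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();
    (void)Is128; (void)NumElts; (void)IntVT;
  }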
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:294
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:179
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:101
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:267
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:154
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:282
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:165
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:104
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:218
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:289
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:304
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:173
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:340
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:189
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:138
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:98
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
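A short sketch combining several of the KnownBits operations above; the 8-bit constants are chosen only so the result is easy to check by hand:
  #include <cassert>
  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"

  void demoKnownBits() {
    // Two fully known 8-bit values: 12 (0b00001100) and 4 (0b00000100).
    llvm::KnownBits A = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
    llvm::KnownBits B = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
    llvm::KnownBits Sum = llvm::KnownBits::add(A, B);

    assert(Sum.isConstant() && Sum.getConstant() == 16);
    // 16 == 0b00010000, so at least four trailing zeros are known.
    assert(Sum.countMinTrailingZeros() == 4);
  }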
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands, relating them back to the LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
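A tiny sketch of how the MachinePointerInfo factories above are typically combined; the MachineFunction and the offset are assumed to be supplied by the caller:
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h" // declares MachinePointerInfo

  llvm::MachinePointerInfo cpEntryAt(llvm::MachineFunction &MF, int64_t Offset) {
    // Describe a memory access Offset bytes into a constant-pool entry.
    return llvm::MachinePointerInfo::getConstantPool(MF).getWithOffset(Offset);
  }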
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
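The three CallLoweringInfo setters above chain in builder style; a hedged sketch of the common libcall-lowering pattern, where every parameter stands in for a value produced by the surrounding lowering code:
  #include <utility>
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/IR/CallingConv.h"

  std::pair<llvm::SDValue, llvm::SDValue>
  emitLibCall(llvm::SelectionDAG &DAG, const llvm::TargetLowering &TLI,
              const llvm::SDLoc &dl, llvm::SDValue Chain, llvm::SDValue Callee,
              llvm::Type *RetTy, llvm::TargetLowering::ArgListTy &&Args) {
    llvm::TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
        .setChain(Chain)
        .setLibCallee(llvm::CallingConv::C, RetTy, Callee, std::move(Args));
    // Returns {call result, output chain}.
    return TLI.LowerCallTo(CLI);
  }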
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.